pdfjs-reader-core 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +265 -0
- package/dist/index.cjs +2528 -96
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1084 -3
- package/dist/index.d.ts +1084 -3
- package/dist/index.js +2501 -96
- package/dist/index.js.map +1 -1
- package/package.json +21 -17
- package/LICENSE +0 -21
package/README.md
CHANGED
|
@@ -1060,6 +1060,271 @@ Customize loading and error UI components.
|
|
|
1060
1060
|
|
|
1061
1061
|
---
|
|
1062
1062
|
|
|
1063
|
+
## 14. Cinematic Tutor Mode (v0.4.0+)
|
|
1064
|
+
|
|
1065
|
+
Pair a voice/LLM narration agent with the PDF: as the agent speaks, the viewer
|
|
1066
|
+
synchronises on-page visuals — spotlights, underlines, highlights, pulses,
|
|
1067
|
+
callouts, boxes, labels, ghost references, and camera zooms — so the reader
|
|
1068
|
+
sees the page react like a produced teaching video.
|
|
1069
|
+
|
|
1070
|
+
The feature is packaged as a single component, `TutorModeContainer`, that fills
|
|
1071
|
+
100 % of its parent's width and height and shows **only** the PDF plus overlays.
|
|
1072
|
+
No sidebars, no dev toolbar, no inspector UI ships in the production bundle.
|
|
1073
|
+
|
|
1074
|
+
### Quick start
|
|
1075
|
+
|
|
1076
|
+
```tsx
|
|
1077
|
+
import {
|
|
1078
|
+
PDFViewerProvider,
|
|
1079
|
+
TutorModeContainer,
|
|
1080
|
+
createNarrationStore,
|
|
1081
|
+
loadDocumentWithCallbacks,
|
|
1082
|
+
useViewerStore,
|
|
1083
|
+
type NarrationStoreApi,
|
|
1084
|
+
type LlmConfig,
|
|
1085
|
+
type PageBBoxData,
|
|
1086
|
+
} from 'pdfjs-reader-core';
|
|
1087
|
+
import 'pdfjs-reader-core/styles.css';
|
|
1088
|
+
import { useEffect, useRef } from 'react';
|
|
1089
|
+
|
|
1090
|
+
function DocumentLoader({ url }: { url: string }) {
|
|
1091
|
+
const setDocument = useViewerStore((s) => s.setDocument);
|
|
1092
|
+
useEffect(() => {
|
|
1093
|
+
const { promise, cancel } = loadDocumentWithCallbacks({
|
|
1094
|
+
src: url,
|
|
1095
|
+
onDocumentReady: (doc) => setDocument(doc),
|
|
1096
|
+
onFirstPageReady: () => {},
|
|
1097
|
+
});
|
|
1098
|
+
promise.catch(() => {});
|
|
1099
|
+
return () => cancel();
|
|
1100
|
+
}, [url, setDocument]);
|
|
1101
|
+
return null;
|
|
1102
|
+
}
|
|
1103
|
+
|
|
1104
|
+
export function TutorScene({
|
|
1105
|
+
pdfUrl,
|
|
1106
|
+
bboxData,
|
|
1107
|
+
currentPage,
|
|
1108
|
+
currentChunk,
|
|
1109
|
+
llm,
|
|
1110
|
+
}: {
|
|
1111
|
+
pdfUrl: string;
|
|
1112
|
+
bboxData: PageBBoxData[];
|
|
1113
|
+
currentPage: number;
|
|
1114
|
+
/** The text the voice agent is currently speaking. Update reactively. */
|
|
1115
|
+
currentChunk: string | null;
|
|
1116
|
+
llm: LlmConfig;
|
|
1117
|
+
}) {
|
|
1118
|
+
const storeRef = useRef<NarrationStoreApi | null>(null);
|
|
1119
|
+
if (!storeRef.current) storeRef.current = createNarrationStore();
|
|
1120
|
+
|
|
1121
|
+
return (
|
|
1122
|
+
<div style={{ width: '100%', height: '100vh' }}>
|
|
1123
|
+
<PDFViewerProvider>
|
|
1124
|
+
<DocumentLoader url={pdfUrl} />
|
|
1125
|
+
<TutorModeContainer
|
|
1126
|
+
pageNumber={currentPage}
|
|
1127
|
+
bboxData={bboxData}
|
|
1128
|
+
narrationStore={storeRef.current}
|
|
1129
|
+
scale={1}
|
|
1130
|
+
currentChunk={currentChunk}
|
|
1131
|
+
llm={llm}
|
|
1132
|
+
minOverlayDurationMs={4000}
|
|
1133
|
+
/>
|
|
1134
|
+
</PDFViewerProvider>
|
|
1135
|
+
</div>
|
|
1136
|
+
);
|
|
1137
|
+
}
|
|
1138
|
+
```
|
|
1139
|
+
|
|
1140
|
+
That's it. Whenever `currentChunk` changes, the LLM director picks the
|
|
1141
|
+
matching block(s) on the current page and the engine plays a storyboard over
|
|
1142
|
+
the PDF. Typical end-to-end latency is <500 ms on a fast model (sentence-level
|
|
1143
|
+
sync is the design target; word-level TTS alignment is out of scope).
|
|
1144
|
+
|
|
1145
|
+
### What the visuals look like
|
|
1146
|
+
|
|
1147
|
+
The director can emit any combination of these effects per narration chunk:
|
|
1148
|
+
|
|
1149
|
+
| Effect | Typical trigger from narration |
|
|
1150
|
+
|---|---|
|
|
1151
|
+
| `camera` | Focus shifts to a new region (gentle re-centre + optional zoom) |
|
|
1152
|
+
| `spotlight` | "The key idea here is …" — dim everything except one block |
|
|
1153
|
+
| `underline` | Quoted phrase or word-by-word reading |
|
|
1154
|
+
| `highlight` | "Notice this keyword …" — amber marker sweep |
|
|
1155
|
+
| `pulse` | "Look at the diagram" — block scales in/out to catch the eye |
|
|
1156
|
+
| `callout` | Captions to figures / label to region — curved arrow + label |
|
|
1157
|
+
| `box` | "Under this section …" — blue frame around a structural region |
|
|
1158
|
+
| `label` | "This is the definition" — sticky-note pill tag |
|
|
1159
|
+
| `ghost_reference` | "As we saw on page 2 …" — floating card with the remote block |
|
|
1160
|
+
|
|
1161
|
+
Four opinionated **intent recipes** are baked into the prompt and the LLM
|
|
1162
|
+
composes from them or mixes freely: `define`, `point_out`, `compare`,
|
|
1163
|
+
`emphasize`.
|
|
1164
|
+
|
|
1165
|
+
### The bbox data contract
|
|
1166
|
+
|
|
1167
|
+
`bboxData: PageBBoxData[]` is a per-page list of typed blocks:
|
|
1168
|
+
|
|
1169
|
+
```ts
|
|
1170
|
+
interface PageBBoxData {
|
|
1171
|
+
id: string;
|
|
1172
|
+
page_number: number;
|
|
1173
|
+
page_text: string;
|
|
1174
|
+
page_dimensions: { width: number; height: number; dpi: number };
|
|
1175
|
+
blocks: Block[];
|
|
1176
|
+
created_at: string;
|
|
1177
|
+
}
|
|
1178
|
+
|
|
1179
|
+
interface Block {
|
|
1180
|
+
block_id: string; // stable id — referenced by storyboards
|
|
1181
|
+
  bbox: [x1: number, y1: number, x2: number, y2: number]; // coords in the page's native DPI space
|
|
1182
|
+
text: string | null;
|
|
1183
|
+
type:
|
|
1184
|
+
| 'heading' | 'paragraph' | 'list_item'
|
|
1185
|
+
| 'figure' | 'figure_region' | 'caption'
|
|
1186
|
+
| 'table' | 'mcq_option';
|
|
1187
|
+
parent_id: string | null;
|
|
1188
|
+
confidence: number;
|
|
1189
|
+
reading_order: number;
|
|
1190
|
+
default_action: 'zoom_pan' | 'spotlight' | 'underline' | 'pulse';
|
|
1191
|
+
semantic_unit_id: string;
|
|
1192
|
+
}
|
|
1193
|
+
```
|
|
1194
|
+
|
|
1195
|
+
The LLM receives this inventory per call and MUST anchor every action to a
|
|
1196
|
+
real `block_id` — hallucinated targets are rejected by both the JSON schema
|
|
1197
|
+
and the runtime validator.
|
|
1198
|
+
|
|
1199
|
+
### Salvage guarantees
|
|
1200
|
+
|
|
1201
|
+
Small models occasionally emit malformed JSON, out-of-range numbers, or
|
|
1202
|
+
camera-only storyboards. The director pipeline defends against all three so
|
|
1203
|
+
the visuals never silently stall:
|
|
1204
|
+
|
|
1205
|
+
- **Whitespace collapse** — runs of ≥8 whitespace characters outside string
|
|
1206
|
+
literals are collapsed before `JSON.parse` (fixes tab-spam in `gpt-4.1-nano`).
|
|
1207
|
+
- **Range clamp** — `camera.scale`, `padding`, `dim_opacity`, `feather_px`,
|
|
1208
|
+
`draw_duration_ms`, `count`, `at_ms`, step `duration_ms` are clamped to
|
|
1209
|
+
their schema-legal range before zod validation.
|
|
1210
|
+
- **Overlay-presence enforcement** — if the validated storyboard is a single
|
|
1211
|
+
camera step, a `pulse` on the same target is auto-appended. A lone camera
|
|
1212
|
+
  step is therefore never emitted.
|
|
1213
|
+
|
|
1214
|
+
Plus an optional **embedding fallback**: pass `embeddingProvider={getLocalMiniLM()}`
|
|
1215
|
+
and, if the LLM request fails entirely, the package will match the
|
|
1216
|
+
chunk to the closest block via local text embeddings and emit a
|
|
1217
|
+
`block.type`-appropriate storyboard (heading → spotlight + label, paragraph →
|
|
1218
|
+
underline, caption → callout to nearest figure, etc.).
|
|
1219
|
+
|
|
1220
|
+
### Overlay hold time
|
|
1221
|
+
|
|
1222
|
+
Per-overlay `duration_ms` can be as low as 100 ms in the schema, and recipes
|
|
1223
|
+
often specify 600–1200 ms. For voice-narration UX that is too quick to
|
|
1224
|
+
register. The engine therefore enforces a **minimum hold time**:
|
|
1225
|
+
|
|
1226
|
+
```tsx
|
|
1227
|
+
<TutorModeContainer
|
|
1228
|
+
...
|
|
1229
|
+
minOverlayDurationMs={4000} // default 3500 ms
|
|
1230
|
+
/>
|
|
1231
|
+
```
|
|
1232
|
+
|
|
1233
|
+
A good heuristic: set it to the average spoken-chunk duration so each overlay
|
|
1234
|
+
lives as long as the narration it accompanies.
|
|
1235
|
+
|
|
1236
|
+
### The Reset view button
|
|
1237
|
+
|
|
1238
|
+
The package renders a **Reset view** button in the top-right of the PDF area by
|
|
1239
|
+
default. Clicking it clears every overlay and returns the camera to fit-page.
|
|
1240
|
+
|
|
1241
|
+
```tsx
|
|
1242
|
+
// Pure student-facing reset (recommended)
|
|
1243
|
+
<TutorModeContainer ... showExitButton />
|
|
1244
|
+
|
|
1245
|
+
// Reset + also leave tutor mode
|
|
1246
|
+
<TutorModeContainer
|
|
1247
|
+
...
|
|
1248
|
+
showExitButton
|
|
1249
|
+
onExitTutorMode={() => router.push('/library')}
|
|
1250
|
+
/>
|
|
1251
|
+
|
|
1252
|
+
// No button at all
|
|
1253
|
+
<TutorModeContainer ... showExitButton={false} />
|
|
1254
|
+
```
|
|
1255
|
+
|
|
1256
|
+
### Props
|
|
1257
|
+
|
|
1258
|
+
| Prop | Type | Default | Description |
|
|
1259
|
+
|---|---|---|---|
|
|
1260
|
+
| `pageNumber` | `number` | required | 1-indexed page to render |
|
|
1261
|
+
| `bboxData` | `PageBBoxData[]` | required | Per-page block inventory from your ingestion backend |
|
|
1262
|
+
| `narrationStore` | `NarrationStoreApi` | required | Store created via `createNarrationStore()` — one per tutor session |
|
|
1263
|
+
| `scale` | `number` | `1` | Raster scale multiplier on top of the native DPI |
|
|
1264
|
+
| `rotation` | `number` | `0` | Page rotation in degrees |
|
|
1265
|
+
| `currentChunk` | `string \| null` | `null` | The reactive text the tutor agent is currently speaking |
|
|
1266
|
+
| `llm` | `LlmConfig` | — | OpenAI-compatible endpoint config (URL, model, auth token, extra body) |
|
|
1267
|
+
| `embeddingProvider` | `EmbeddingProvider` | — | Optional local fallback when the LLM fails |
|
|
1268
|
+
| `idleTimeoutMs` | `number` | `5000` | How long with no new chunks before the camera returns to fit-page |
|
|
1269
|
+
| `llmTimeoutMs` | `number` | `30000` | Hard timeout for the LLM call |
|
|
1270
|
+
| `minOverlayDurationMs` | `number` | `3500` | Minimum visible hold for every overlay, regardless of the LLM's `duration_ms` |
|
|
1271
|
+
| `showSubtitles` | `boolean` | `false` | Render a subtitle bar with the current chunk text |
|
|
1272
|
+
| `showExitButton` | `boolean` | `true` | Render the top-right "Reset view" button |
|
|
1273
|
+
| `onExitTutorMode` | `() => void` | — | Optional callback fired AFTER the reset — use it to also leave tutor mode |
|
|
1274
|
+
| `backgroundColor` | `string` | `'#ffffff'` | Background colour shown around the PDF when the viewport is larger than the fitted page. v0.4.1+ |
|
|
1275
|
+
| `loadingComponent` | `ReactNode` | default spinner | Custom loading state shown while the PDF document/page is still loading. v0.4.1+ |
|
|
1276
|
+
| `className` | `string` | — | Passes through to the root container for custom theming |
|
|
1277
|
+
|
|
1278
|
+
### LlmConfig
|
|
1279
|
+
|
|
1280
|
+
```ts
|
|
1281
|
+
interface LlmConfig {
|
|
1282
|
+
endpointUrl: string; // OpenAI-compatible /v1/chat/completions
|
|
1283
|
+
model: string;
|
|
1284
|
+
authToken?: string;
|
|
1285
|
+
extraBody?: Record<string, unknown>;
|
|
1286
|
+
maxTokens?: number; // default 1024
|
|
1287
|
+
temperature?: number; // default 0.3
|
|
1288
|
+
useJsonSchema?: boolean; // default true — request structured output
|
|
1289
|
+
stream?: boolean; // default false
|
|
1290
|
+
}
|
|
1291
|
+
```
|
|
1292
|
+
|
|
1293
|
+
**Never hardcode the endpoint URL into the bundle** — this package ships to
|
|
1294
|
+
multiple consumers and each owns its own inference endpoint. Pass the URL as
|
|
1295
|
+
a prop at the call site, sourced from an env var or runtime config.
|
|
1296
|
+
|
|
1297
|
+
### The integration contract in one picture
|
|
1298
|
+
|
|
1299
|
+
```
|
|
1300
|
+
Aria voice agent
|
|
1301
|
+
│ (emits sentence text as it speaks)
|
|
1302
|
+
▼
|
|
1303
|
+
<TutorModeContainer currentChunk={utterance} … />
|
|
1304
|
+
│
|
|
1305
|
+
│ debounce 200 ms
|
|
1306
|
+
▼
|
|
1307
|
+
directStoryboard(llm, chunk, bbox blocks)
|
|
1308
|
+
│ ← JSON-schema-constrained response
|
|
1309
|
+
│ ← salvage: whitespace collapse, range clamp
|
|
1310
|
+
▼
|
|
1311
|
+
StoryboardSchema.safeParse → enforceOverlayPresence
|
|
1312
|
+
│
|
|
1313
|
+
▼
|
|
1314
|
+
StoryboardEngine.execute
|
|
1315
|
+
│ ← scheduled via setTimeout per step.at_ms
|
|
1316
|
+
▼
|
|
1317
|
+
narrationStore {camera, activeOverlays}
|
|
1318
|
+
│ ← reactive
|
|
1319
|
+
▼
|
|
1320
|
+
<CameraView><PDFPage /><CinemaLayer /></CameraView>
|
|
1321
|
+
│
|
|
1322
|
+
▼
|
|
1323
|
+
Student sees the PDF react in real time
|
|
1324
|
+
```
|
|
1325
|
+
|
|
1326
|
+
---
|
|
1327
|
+
|
|
1063
1328
|
## API Reference
|
|
1064
1329
|
|
|
1065
1330
|
### PDFViewerClient Props
|