citeindex 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. citeindex-0.12.0/.claude/settings.local.json +7 -0
  2. citeindex-0.12.0/.gitignore +29 -0
  3. citeindex-0.12.0/.omc/project-memory.json +293 -0
  4. citeindex-0.12.0/.omc/sessions/a5782e06-815d-4db8-b4dc-fd90a82b045f.json +8 -0
  5. citeindex-0.12.0/.omc/sessions/c68a47d6-0746-405c-84f8-03d8c678c6c0.json +8 -0
  6. citeindex-0.12.0/.python-version +1 -0
  7. citeindex-0.12.0/PKG-INFO +221 -0
  8. citeindex-0.12.0/README.md +185 -0
  9. citeindex-0.12.0/citeindex/__init__.py +59 -0
  10. citeindex-0.12.0/citeindex/citation_style.py +79 -0
  11. citeindex-0.12.0/citeindex/cli.py +168 -0
  12. citeindex-0.12.0/citeindex/file_converter.py +443 -0
  13. citeindex-0.12.0/citeindex/ingestion/__init__.py +3 -0
  14. citeindex-0.12.0/citeindex/ingestion/deterministic.py +161 -0
  15. citeindex-0.12.0/citeindex/ingestion/markdown_export.py +351 -0
  16. citeindex-0.12.0/citeindex/ingestion/master.py +369 -0
  17. citeindex-0.12.0/citeindex/ingestion/models.py +82 -0
  18. citeindex-0.12.0/citeindex/ingestion/pipelines/__init__.py +12 -0
  19. citeindex-0.12.0/citeindex/ingestion/pipelines/common.py +315 -0
  20. citeindex-0.12.0/citeindex/ingestion/pipelines/digital_pdf.py +359 -0
  21. citeindex-0.12.0/citeindex/ingestion/pipelines/dspy_extract.py +772 -0
  22. citeindex-0.12.0/citeindex/ingestion/pipelines/grobid.py +403 -0
  23. citeindex-0.12.0/citeindex/ingestion/pipelines/layout.py +247 -0
  24. citeindex-0.12.0/citeindex/ingestion/pipelines/media.py +321 -0
  25. citeindex-0.12.0/citeindex/ingestion/pipelines/mineru.py +445 -0
  26. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/__init__.py +8 -0
  27. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/config.yaml +8 -0
  28. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/page_index.py +1154 -0
  29. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/page_index_md.py +342 -0
  30. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/retrieve.py +137 -0
  31. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/utils.py +710 -0
  32. citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex_tree.py +366 -0
  33. citeindex-0.12.0/citeindex/ingestion/pipelines/scanned_pdf.py +104 -0
  34. citeindex-0.12.0/citeindex/ingestion/pipelines/url_article.py +641 -0
  35. citeindex-0.12.0/citeindex/ingestion/storage.py +110 -0
  36. citeindex-0.12.0/citeindex/ingestion/url_crawler.py +144 -0
  37. citeindex-0.12.0/citeindex/llm.py +60 -0
  38. citeindex-0.12.0/citeindex/model.py +1161 -0
  39. citeindex-0.12.0/citeindex/ocr_lang_detect.py +159 -0
  40. citeindex-0.12.0/citeindex/ocr_text_clean_before_llm.py +251 -0
  41. citeindex-0.12.0/citeindex/page_extractor.py +549 -0
  42. citeindex-0.12.0/citeindex/search.py +92 -0
  43. citeindex-0.12.0/citeindex/styles/chicago-author-date.csl +544 -0
  44. citeindex-0.12.0/citeindex/type_judge.py +119 -0
  45. citeindex-0.12.0/citeindex/utils.py +799 -0
  46. citeindex-0.12.0/citeindex/vertical_handler.py +269 -0
  47. citeindex-0.12.0/citeindex/vertical_llm.py +3259 -0
  48. citeindex-0.12.0/docs/pageindex-integration-plan.md +198 -0
  49. citeindex-0.12.0/docs/plans/2026-04-27-ingest-only-refactor.md +882 -0
  50. citeindex-0.12.0/docs/v12-runtime-migration.md +71 -0
  51. citeindex-0.12.0/pyproject.toml +51 -0
  52. citeindex-0.12.0/requirements-dev.lock +576 -0
  53. citeindex-0.12.0/requirements.lock +565 -0
  54. citeindex-0.12.0/tests/test_style.py +46 -0
  55. citeindex-0.12.0/uv.lock +8441 -0
@@ -0,0 +1,7 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(wc -l /home/ajiap/project/citeindex/citeindex/agents/*.py /home/ajiap/project/citeindex/citeindex/ingestion/*.py /home/ajiap/project/citeindex/citeindex/ingestion/pipelines/*.py /home/ajiap/project/citeindex/citeindex/*.py)"
5
+ ]
6
+ }
7
+ }
@@ -0,0 +1,29 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .venv
9
+
10
+ # Environment
11
+ .env
12
+
13
+ # Corpus (user data, never committed)
14
+ corpus/
15
+
16
+ # Test output
17
+ test_output/
18
+
19
+ # Rust (removed, prevent accidents)
20
+ target/
21
+ Cargo.lock
22
+ *.rs.bk
23
+
24
+ # IDE / editor
25
+ .vscode/
26
+ .idea/
27
+ *.swp
28
+ *.swo
29
+ *~
@@ -0,0 +1,293 @@
1
+ {
2
+ "version": "1.0.0",
3
+ "lastScanned": 1776178956720,
4
+ "projectRoot": "/home/ajiap/project/citeindex",
5
+ "techStack": {
6
+ "languages": [
7
+ {
8
+ "name": "Python",
9
+ "version": null,
10
+ "confidence": "high",
11
+ "markers": [
12
+ "pyproject.toml"
13
+ ]
14
+ }
15
+ ],
16
+ "frameworks": [
17
+ {
18
+ "name": "pytest",
19
+ "version": null,
20
+ "category": "testing"
21
+ }
22
+ ],
23
+ "packageManager": null,
24
+ "runtime": null
25
+ },
26
+ "build": {
27
+ "buildCommand": null,
28
+ "testCommand": "pytest",
29
+ "lintCommand": "ruff check",
30
+ "devCommand": null,
31
+ "scripts": {}
32
+ },
33
+ "conventions": {
34
+ "namingStyle": null,
35
+ "importStyle": null,
36
+ "testPattern": null,
37
+ "fileOrganization": null
38
+ },
39
+ "structure": {
40
+ "isMonorepo": false,
41
+ "workspaces": [],
42
+ "mainDirectories": [
43
+ "docs",
44
+ "tests"
45
+ ],
46
+ "gitBranches": {
47
+ "defaultBranch": "0.10.04",
48
+ "branchingStrategy": null
49
+ }
50
+ },
51
+ "customNotes": [],
52
+ "directoryMap": {
53
+ "__pycache__": {
54
+ "path": "__pycache__",
55
+ "purpose": null,
56
+ "fileCount": 2,
57
+ "lastAccessed": 1776178956711,
58
+ "keyFiles": [
59
+ "test_install.cpython-312-pytest-8.3.5.pyc",
60
+ "test_page_extraction.cpython-312-pytest-8.3.5.pyc"
61
+ ]
62
+ },
63
+ "citeindex": {
64
+ "path": "citeindex",
65
+ "purpose": null,
66
+ "fileCount": 15,
67
+ "lastAccessed": 1776178956715,
68
+ "keyFiles": [
69
+ "__init__.py",
70
+ "citation_style.py",
71
+ "cli.py",
72
+ "file_converter.py",
73
+ "llm.py"
74
+ ]
75
+ },
76
+ "citeindex-rs": {
77
+ "path": "citeindex-rs",
78
+ "purpose": null,
79
+ "fileCount": 4,
80
+ "lastAccessed": 1776178956715,
81
+ "keyFiles": [
82
+ "Cargo.lock",
83
+ "Cargo.toml",
84
+ "citeindex-tui.log",
85
+ "config.toml"
86
+ ]
87
+ },
88
+ "corpus": {
89
+ "path": "corpus",
90
+ "purpose": null,
91
+ "fileCount": 2,
92
+ "lastAccessed": 1776178956717,
93
+ "keyFiles": [
94
+ "_url_content_hashes.json",
95
+ "ingestion_log.jsonl"
96
+ ]
97
+ },
98
+ "dist": {
99
+ "path": "dist",
100
+ "purpose": "Distribution/build output",
101
+ "fileCount": 2,
102
+ "lastAccessed": 1776178956717,
103
+ "keyFiles": [
104
+ "cite_extractor-0.10.8-py3-none-any.whl",
105
+ "cite_extractor-0.10.8.tar.gz"
106
+ ]
107
+ },
108
+ "docs": {
109
+ "path": "docs",
110
+ "purpose": "Documentation",
111
+ "fileCount": 1,
112
+ "lastAccessed": 1776178956717,
113
+ "keyFiles": [
114
+ "v12-runtime-migration.md"
115
+ ]
116
+ },
117
+ "example": {
118
+ "path": "example",
119
+ "purpose": null,
120
+ "fileCount": 55,
121
+ "lastAccessed": 1776178956717,
122
+ "keyFiles": [
123
+ "2013-tarsākyā:.json",
124
+ "2013_庄子集释_中华书局.json",
125
+ "2025_心念之病_光从东方来.md.json",
126
+ "A_Jiatandongfangjiaohui_2025_为何保守派倾向于认为其他教派是异端_Youtube.md.json",
127
+ "Bai-Wengu-白文固_2003_唐代僧籍管理制度.json"
128
+ ]
129
+ },
130
+ "instruction": {
131
+ "path": "instruction",
132
+ "purpose": null,
133
+ "fileCount": 3,
134
+ "lastAccessed": 1776178956718,
135
+ "keyFiles": [
136
+ "CiteIndex Summary v11.pdf",
137
+ "CiteIndex v12 Full.pdf",
138
+ "Summary.md"
139
+ ]
140
+ },
141
+ "test_output": {
142
+ "path": "test_output",
143
+ "purpose": null,
144
+ "fileCount": 1,
145
+ "lastAccessed": 1776178956718,
146
+ "keyFiles": [
147
+ "Media_content_from_URL.md.json"
148
+ ]
149
+ },
150
+ "tests": {
151
+ "path": "tests",
152
+ "purpose": "Test files",
153
+ "fileCount": 22,
154
+ "lastAccessed": 1776178956718,
155
+ "keyFiles": [
156
+ "Bai-Wengu-白文固_2003_唐代僧籍管理制度.pdf",
157
+ "Bai-Yudong-白玉冬-2018-丝路景教与汪古渊流.pdf",
158
+ "Guo-qingfan-庄子集释 (郭庆藩).pdf",
159
+ "README.md.backup",
160
+ "main.py.backup"
161
+ ]
162
+ }
163
+ },
164
+ "hotPaths": [
165
+ {
166
+ "path": "README.md",
167
+ "accessCount": 1,
168
+ "lastAccessed": 1776179072424,
169
+ "type": "file"
170
+ },
171
+ {
172
+ "path": "pyproject.toml",
173
+ "accessCount": 1,
174
+ "lastAccessed": 1776179072470,
175
+ "type": "file"
176
+ },
177
+ {
178
+ "path": "citeindex/agents/__init__.py",
179
+ "accessCount": 1,
180
+ "lastAccessed": 1776179079362,
181
+ "type": "file"
182
+ },
183
+ {
184
+ "path": "citeindex/agents/v12_runtime.py",
185
+ "accessCount": 1,
186
+ "lastAccessed": 1776179096866,
187
+ "type": "file"
188
+ },
189
+ {
190
+ "path": "citeindex/cli.py",
191
+ "accessCount": 1,
192
+ "lastAccessed": 1776179096911,
193
+ "type": "file"
194
+ },
195
+ {
196
+ "path": "citeindex-rs/Cargo.toml",
197
+ "accessCount": 1,
198
+ "lastAccessed": 1776179104160,
199
+ "type": "file"
200
+ },
201
+ {
202
+ "path": "docs/v12-runtime-migration.md",
203
+ "accessCount": 1,
204
+ "lastAccessed": 1776179104203,
205
+ "type": "file"
206
+ },
207
+ {
208
+ "path": "citeindex/agents/models.py",
209
+ "accessCount": 1,
210
+ "lastAccessed": 1776179104260,
211
+ "type": "file"
212
+ },
213
+ {
214
+ "path": "citeindex/llm.py",
215
+ "accessCount": 1,
216
+ "lastAccessed": 1776180019547,
217
+ "type": "file"
218
+ },
219
+ {
220
+ "path": "citeindex/agents/chat.py",
221
+ "accessCount": 1,
222
+ "lastAccessed": 1776180019587,
223
+ "type": "file"
224
+ },
225
+ {
226
+ "path": "citeindex/agents/generation.py",
227
+ "accessCount": 1,
228
+ "lastAccessed": 1776180019616,
229
+ "type": "file"
230
+ },
231
+ {
232
+ "path": "citeindex/agents/query_planner.py",
233
+ "accessCount": 1,
234
+ "lastAccessed": 1776181697113,
235
+ "type": "file"
236
+ },
237
+ {
238
+ "path": "citeindex/agents/integrity.py",
239
+ "accessCount": 1,
240
+ "lastAccessed": 1776181697156,
241
+ "type": "file"
242
+ },
243
+ {
244
+ "path": "citeindex-rs/crates/kernel/src/agent_runtime/mod.rs",
245
+ "accessCount": 1,
246
+ "lastAccessed": 1776182005187,
247
+ "type": "file"
248
+ },
249
+ {
250
+ "path": "citeindex-rs/crates/kernel/src/tools/mod.rs",
251
+ "accessCount": 1,
252
+ "lastAccessed": 1776182005231,
253
+ "type": "file"
254
+ },
255
+ {
256
+ "path": "citeindex-rs/crates/tui/src/app.rs",
257
+ "accessCount": 1,
258
+ "lastAccessed": 1776182005267,
259
+ "type": "file"
260
+ },
261
+ {
262
+ "path": "citeindex-rs/crates/core/src/engine.rs",
263
+ "accessCount": 1,
264
+ "lastAccessed": 1776182014922,
265
+ "type": "file"
266
+ },
267
+ {
268
+ "path": "citeindex-rs/crates/core/src/config.rs",
269
+ "accessCount": 1,
270
+ "lastAccessed": 1776182014951,
271
+ "type": "file"
272
+ },
273
+ {
274
+ "path": "citeindex-rs/crates/core/src/ipc.rs",
275
+ "accessCount": 1,
276
+ "lastAccessed": 1776182056484,
277
+ "type": "file"
278
+ },
279
+ {
280
+ "path": "citeindex-rs/config.toml",
281
+ "accessCount": 1,
282
+ "lastAccessed": 1776182061838,
283
+ "type": "file"
284
+ },
285
+ {
286
+ "path": "citeindex/agents/coordinator.py",
287
+ "accessCount": 1,
288
+ "lastAccessed": 1776182061876,
289
+ "type": "file"
290
+ }
291
+ ],
292
+ "userDirectives": []
293
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "session_id": "a5782e06-815d-4db8-b4dc-fd90a82b045f",
3
+ "ended_at": "2026-04-14T20:02:26.698Z",
4
+ "reason": "prompt_input_exit",
5
+ "agents_spawned": 0,
6
+ "agents_completed": 0,
7
+ "modes_used": []
8
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "session_id": "c68a47d6-0746-405c-84f8-03d8c678c6c0",
3
+ "ended_at": "2026-04-14T15:03:12.144Z",
4
+ "reason": "other",
5
+ "agents_spawned": 0,
6
+ "agents_completed": 0,
7
+ "modes_used": []
8
+ }
@@ -0,0 +1 @@
1
+ 3.12.8
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.3
2
+ Name: citeindex
3
+ Version: 0.12.0
4
+ Summary: Ingest sources with proper citation — PDF, URL, media, Office, DJVU
5
+ Author-email: ajia <yyjfwoaini@gmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: citeproc-py>=0.7.0
9
+ Requires-Dist: crawl4ai>=0.7.0
10
+ Requires-Dist: dspy-ai>=2.6.27
11
+ Requires-Dist: fasttext>=0.9.2
12
+ Requires-Dist: jsonschema>=4.20.0
13
+ Requires-Dist: litellm>=1.83.0
14
+ Requires-Dist: lxml>=4.9.0
15
+ Requires-Dist: mineru[all]>=2.6.4
16
+ Requires-Dist: ocrmypdf>=16.10.4
17
+ Requires-Dist: paddleocr>=3.1.0
18
+ Requires-Dist: paddlepaddle>=3.1.0
19
+ Requires-Dist: playwright>=1.40.0
20
+ Requires-Dist: pyannote-audio>=3.1.0
21
+ Requires-Dist: pymediainfo>=7.0.1
22
+ Requires-Dist: pymupdf[mupdf-third]>=1.26.3
23
+ Requires-Dist: pypdf2>=3.0.1
24
+ Requires-Dist: pypinyin>=0.51.0
25
+ Requires-Dist: python-dateutil>=2.8.0
26
+ Requires-Dist: python-dotenv>=1.1.0
27
+ Requires-Dist: pyyaml>=6.0.0
28
+ Requires-Dist: readability-lxml>=0.8.1
29
+ Requires-Dist: requests>=2.31.0
30
+ Requires-Dist: setuptools>=80.9.0
31
+ Requires-Dist: trafilatura>=1.6.0
32
+ Requires-Dist: urllib3>=2.0.0
33
+ Requires-Dist: whisperx>=3.1.0
34
+ Requires-Dist: yt-dlp>=2025.7.21
35
+ Description-Content-Type: text/markdown
36
+
37
+ # CiteIndex
38
+
39
+ **v0.12.0** — Ingest sources with proper citation. PDF, URL, media, Office, DJVU.
40
+
41
+ Deterministic citation extraction, Merkle-verified integrity, CJK-first OCR.
42
+ Every claim is traced, verified, and cited — no hallucinations.
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ # Using rye (recommended)
48
+ rye sync
49
+
50
+ # Or pip
51
+ pip install -e .
52
+ ```
53
+
54
+ ## CLI
55
+
56
+ ```bash
57
+ # Ingest a PDF
58
+ citeindex paper.pdf
59
+
60
+ # Ingest a URL
61
+ citeindex https://example.com/article
62
+
63
+ # Crawl and ingest all articles from a site
64
+ citeindex https://example.com/articles --all-url-article --crawl-depth 2
65
+
66
+ # Crawl and re-ingest only changed pages
67
+ citeindex https://example.com/articles --update-url-article
68
+
69
+ # Options
70
+ citeindex paper.pdf --llm ollama/qwen3 --type thesis --is-primary
71
+ citeindex paper.pdf --text-direction vertical --vertical-lang ch
72
+ citeindex scanned.pdf --lang auto --page-range "1-10"
73
+ citeindex paper.pdf --no-layout # disable column/footnote detection
74
+ ```
75
+
76
+ ## Python API
77
+
78
+ ```python
79
+ from citeindex import ingest, IngestionConfig
80
+
81
+ # Simple
82
+ result = ingest("paper.pdf")
83
+ print(result["status"]) # "ok"
84
+
85
+ # With config
86
+ config = IngestionConfig(
87
+ llm_model="ollama/qwen3",
88
+ text_direction="vertical",
89
+ is_primary=True,
90
+ )
91
+ result = ingest("paper.pdf", corpus_root="my_corpus", config=config)
92
+ ```
93
+
94
+ ## Ingestion Pipelines
95
+
96
+ CiteIndex automatically detects the input type and routes to the correct pipeline:
97
+
98
+ ### Digital PDF
99
+
100
+ ```
101
+ PDF → GROBID (metadata) → MinerU (layout) → DSPy reconciliation
102
+ → document structure (pages/columns/paragraphs/lines)
103
+ → Merkle tree → store to corpus/
104
+ ```
105
+
106
+ - **GROBID** extracts metadata and references deterministically
107
+ - **MinerU** performs layout analysis (columns, footnotes, tables)
108
+ - **DSPy** reconciles GROBID output with pattern extraction as fallback
109
+ - Builds section-hierarchical document structure with actual page numbers
110
+
111
+ ### Scanned PDF
112
+
113
+ ```
114
+ PDF → OCRmyPDF (normalize) → PaddleOCR (vertical detect) → MinerU (layout)
115
+ → Tesseract (text) → GROBID (citations) → document structure
116
+ → Merkle tree → store to corpus/
117
+ ```
118
+
119
+ - **OCRmyPDF** normalizes and adds text layer to scanned pages
120
+ - **PaddleOCR** detects CJK vertical text layouts
121
+ - **Tesseract** provides OCR with auto-detected language
122
+ - Supports `--text-direction vertical` for traditional Chinese/Japanese
123
+
124
+ ### URL Article
125
+
126
+ ```
127
+ URL → Playwright/requests (fetch) → trafilatura (content)
128
+ → Zotero (metadata) → CSL JSON → deterministic chunking
129
+ → hashes → Merkle tree → store to corpus/
130
+ ```
131
+
132
+ - **Playwright** renders JavaScript-heavy pages (falls back to **requests**)
133
+ - **trafilatura** extracts clean text with heading structure
134
+ - **Zotero** extracts citation metadata (title, authors, date, DOI)
135
+ - Discovers in-page citation guidance (若要引用 / Cite this / etc.)
136
+ - Supports batch crawling with `--all-url-article` and `--update-url-article`
137
+
138
+ ### Media
139
+
140
+ ```
141
+ URL/File → yt-dlp (download) → ffmpeg (audio) → WhisperX (transcription)
142
+ → pyannote (diarization, optional) → CSL JSON
143
+ → chunking → hashes → Merkle tree → store to corpus/
144
+ ```
145
+
146
+ - **yt-dlp** downloads from YouTube, Vimeo, podcasts, etc.
147
+ - **WhisperX** transcribes with word-level timestamps
148
+ - **pyannote** performs speaker diarization (optional)
149
+ - Supports audio (`.mp3`, `.wav`, `.m4a`) and video (`.mp4`, `.mkv`, `.webm`)
150
+
151
+ ### Office & DJVU
152
+
153
+ Office documents (`.docx`, `.doc`, `.rtf`, `.odt`, `.pptx`, `.ppt`, `.odp`) and DJVU (`.djvu`) are converted to PDF via LibreOffice/ddjvu, then routed to the digital or scanned PDF pipeline.
154
+
155
+ ## Configuration Reference
156
+
157
+ | Option | CLI Flag | Default | Description |
158
+ |--------|----------|---------|-------------|
159
+ | `llm_model` | `--llm` | `ollama/qwen3` | LLM model for citation extraction |
160
+ | `text_direction` | `--text-direction`, `-td` | `horizontal` | `horizontal`, `auto`, or `vertical` |
161
+ | `vertical_lang` | `--vertical-lang` | `ch` | CJK language: `ch` (Chinese) or `japan` |
162
+ | `lang` | `--lang`, `-l` | `auto` | OCR language (auto-detect or Tesseract code) |
163
+ | `page_range` | `--page-range`, `-p` | `1-5, -3` | Pages to extract (e.g. `"1-10"`, `"1-5, -3"`) |
164
+ | `doc_type_override` | `--type`, `-t` | auto | `book`, `thesis`, `journal`, or `bookchapter` |
165
+ | `use_layout_analysis` | `--no-layout` | `True` | Column/footnote detection; pass `--no-layout` to disable it |
166
+ | `is_primary` | `--is-primary` | `False` | Line-level granularity (vs paragraph-level) |
167
+ | `use_pageindex` | `--use-pageindex` | `False` | LLM-driven section hierarchy (requires Ollama) |
168
+ | `pageindex_model` | `--pageindex-model` | `ollama/qwen3.5:cloud` | LLM for PageIndex tree building |
169
+ | `citation_style` | (API only) | `chicago-author-date` | CSL citation style for output |
170
+ | `corpus_root` | `--corpus-root` | `corpus` | Output directory for ingested artifacts |
171
+ | `schema_version` | `--schema-version` | `1.0.0` | Output schema version tag |
172
+
173
+ ## Output
174
+
175
+ Each ingestion produces a corpus folder (e.g., `corpus/Author_2024_Title/`) containing:
176
+
177
+ | File | Description |
178
+ |------|-------------|
179
+ | `csl.json` | Citation metadata (CSL-JSON with `ci_*` extensions: `content_hash`, `merkle_root`, `source_type`, `ingestion_timestamp`) |
180
+ | `document.json` | Structured document tree (PageIndex) — sections, pages, paragraphs, lines |
181
+ | `merkle.json` | SHA-256 Merkle tree for integrity verification |
182
+ | `ingestion_output.json` | Full ingestion result with all pipeline outputs |
183
+ | `library.md` | Human-readable citation with extracted text and footnotes |
184
+
185
+ ### Return Value
186
+
187
+ The `ingest()` function returns a dict:
188
+
189
+ ```python
190
+ {
191
+ "status": "ok", # "ok" or "blocked"
192
+ "document_path": "corpus/Author_2024_Title",
193
+ "standardized_csl_json": { ... }, # Full CSL-JSON with ci_ extensions
194
+ "sub_pipeline_outputs": { ... }, # Raw pipeline results
195
+ "ingestion_log_entry": { ... }, # Log entry with merkle_root
196
+ }
197
+
198
+ # On failure:
199
+ {
200
+ "status": "blocked",
201
+ "stage": "detect_resource_type",
202
+ "error_code": "unsupported_input",
203
+ "error_message": "Unsupported input: ...",
204
+ "next_action": "Provide PDF, URL, or media file",
205
+ }
206
+ ```
207
+
208
+ ## Supported Formats
209
+
210
+ | Format | Extension / Protocol |
211
+ |--------|----------------------|
212
+ | Digital PDF | `.pdf` (with embedded text) |
213
+ | Scanned PDF | `.pdf` (image-based, OCR applied) |
214
+ | URL Article | `http://` / `https://` |
215
+ | Media | `.mp3`, `.wav`, `.m4a`, `.mp4`, `.mkv`, `.webm` |
216
+ | Office | `.docx`, `.doc`, `.rtf`, `.odt`, `.pptx`, `.ppt`, `.odp` |
217
+ | DJVU | `.djvu` |
218
+
219
+ ## License
220
+
221
+ MIT