citeindex 0.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- citeindex-0.12.0/.claude/settings.local.json +7 -0
- citeindex-0.12.0/.gitignore +29 -0
- citeindex-0.12.0/.omc/project-memory.json +293 -0
- citeindex-0.12.0/.omc/sessions/a5782e06-815d-4db8-b4dc-fd90a82b045f.json +8 -0
- citeindex-0.12.0/.omc/sessions/c68a47d6-0746-405c-84f8-03d8c678c6c0.json +8 -0
- citeindex-0.12.0/.python-version +1 -0
- citeindex-0.12.0/PKG-INFO +221 -0
- citeindex-0.12.0/README.md +185 -0
- citeindex-0.12.0/citeindex/__init__.py +59 -0
- citeindex-0.12.0/citeindex/citation_style.py +79 -0
- citeindex-0.12.0/citeindex/cli.py +168 -0
- citeindex-0.12.0/citeindex/file_converter.py +443 -0
- citeindex-0.12.0/citeindex/ingestion/__init__.py +3 -0
- citeindex-0.12.0/citeindex/ingestion/deterministic.py +161 -0
- citeindex-0.12.0/citeindex/ingestion/markdown_export.py +351 -0
- citeindex-0.12.0/citeindex/ingestion/master.py +369 -0
- citeindex-0.12.0/citeindex/ingestion/models.py +82 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/__init__.py +12 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/common.py +315 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/digital_pdf.py +359 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/dspy_extract.py +772 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/grobid.py +403 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/layout.py +247 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/media.py +321 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/mineru.py +445 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/__init__.py +8 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/config.yaml +8 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/page_index.py +1154 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/page_index_md.py +342 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/retrieve.py +137 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex/utils.py +710 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/pageindex_tree.py +366 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/scanned_pdf.py +104 -0
- citeindex-0.12.0/citeindex/ingestion/pipelines/url_article.py +641 -0
- citeindex-0.12.0/citeindex/ingestion/storage.py +110 -0
- citeindex-0.12.0/citeindex/ingestion/url_crawler.py +144 -0
- citeindex-0.12.0/citeindex/llm.py +60 -0
- citeindex-0.12.0/citeindex/model.py +1161 -0
- citeindex-0.12.0/citeindex/ocr_lang_detect.py +159 -0
- citeindex-0.12.0/citeindex/ocr_text_clean_before_llm.py +251 -0
- citeindex-0.12.0/citeindex/page_extractor.py +549 -0
- citeindex-0.12.0/citeindex/search.py +92 -0
- citeindex-0.12.0/citeindex/styles/chicago-author-date.csl +544 -0
- citeindex-0.12.0/citeindex/type_judge.py +119 -0
- citeindex-0.12.0/citeindex/utils.py +799 -0
- citeindex-0.12.0/citeindex/vertical_handler.py +269 -0
- citeindex-0.12.0/citeindex/vertical_llm.py +3259 -0
- citeindex-0.12.0/docs/pageindex-integration-plan.md +198 -0
- citeindex-0.12.0/docs/plans/2026-04-27-ingest-only-refactor.md +882 -0
- citeindex-0.12.0/docs/v12-runtime-migration.md +71 -0
- citeindex-0.12.0/pyproject.toml +51 -0
- citeindex-0.12.0/requirements-dev.lock +576 -0
- citeindex-0.12.0/requirements.lock +565 -0
- citeindex-0.12.0/tests/test_style.py +46 -0
- citeindex-0.12.0/uv.lock +8441 -0
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"allow": [
|
|
4
|
+
"Bash(wc -l /home/ajiap/project/citeindex/citeindex/agents/*.py /home/ajiap/project/citeindex/citeindex/ingestion/*.py /home/ajiap/project/citeindex/citeindex/ingestion/pipelines/*.py /home/ajiap/project/citeindex/citeindex/*.py)"
|
|
5
|
+
]
|
|
6
|
+
}
|
|
7
|
+
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
.venv
|
|
9
|
+
|
|
10
|
+
# Environment
|
|
11
|
+
.env
|
|
12
|
+
|
|
13
|
+
# Corpus (user data, never committed)
|
|
14
|
+
corpus/
|
|
15
|
+
|
|
16
|
+
# Test output
|
|
17
|
+
test_output/
|
|
18
|
+
|
|
19
|
+
# Rust (removed, prevent accidents)
|
|
20
|
+
target/
|
|
21
|
+
Cargo.lock
|
|
22
|
+
*.rs.bk
|
|
23
|
+
|
|
24
|
+
# IDE / editor
|
|
25
|
+
.vscode/
|
|
26
|
+
.idea/
|
|
27
|
+
*.swp
|
|
28
|
+
*.swo
|
|
29
|
+
*~
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.0.0",
|
|
3
|
+
"lastScanned": 1776178956720,
|
|
4
|
+
"projectRoot": "/home/ajiap/project/citeindex",
|
|
5
|
+
"techStack": {
|
|
6
|
+
"languages": [
|
|
7
|
+
{
|
|
8
|
+
"name": "Python",
|
|
9
|
+
"version": null,
|
|
10
|
+
"confidence": "high",
|
|
11
|
+
"markers": [
|
|
12
|
+
"pyproject.toml"
|
|
13
|
+
]
|
|
14
|
+
}
|
|
15
|
+
],
|
|
16
|
+
"frameworks": [
|
|
17
|
+
{
|
|
18
|
+
"name": "pytest",
|
|
19
|
+
"version": null,
|
|
20
|
+
"category": "testing"
|
|
21
|
+
}
|
|
22
|
+
],
|
|
23
|
+
"packageManager": null,
|
|
24
|
+
"runtime": null
|
|
25
|
+
},
|
|
26
|
+
"build": {
|
|
27
|
+
"buildCommand": null,
|
|
28
|
+
"testCommand": "pytest",
|
|
29
|
+
"lintCommand": "ruff check",
|
|
30
|
+
"devCommand": null,
|
|
31
|
+
"scripts": {}
|
|
32
|
+
},
|
|
33
|
+
"conventions": {
|
|
34
|
+
"namingStyle": null,
|
|
35
|
+
"importStyle": null,
|
|
36
|
+
"testPattern": null,
|
|
37
|
+
"fileOrganization": null
|
|
38
|
+
},
|
|
39
|
+
"structure": {
|
|
40
|
+
"isMonorepo": false,
|
|
41
|
+
"workspaces": [],
|
|
42
|
+
"mainDirectories": [
|
|
43
|
+
"docs",
|
|
44
|
+
"tests"
|
|
45
|
+
],
|
|
46
|
+
"gitBranches": {
|
|
47
|
+
"defaultBranch": "0.10.04",
|
|
48
|
+
"branchingStrategy": null
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"customNotes": [],
|
|
52
|
+
"directoryMap": {
|
|
53
|
+
"__pycache__": {
|
|
54
|
+
"path": "__pycache__",
|
|
55
|
+
"purpose": null,
|
|
56
|
+
"fileCount": 2,
|
|
57
|
+
"lastAccessed": 1776178956711,
|
|
58
|
+
"keyFiles": [
|
|
59
|
+
"test_install.cpython-312-pytest-8.3.5.pyc",
|
|
60
|
+
"test_page_extraction.cpython-312-pytest-8.3.5.pyc"
|
|
61
|
+
]
|
|
62
|
+
},
|
|
63
|
+
"citeindex": {
|
|
64
|
+
"path": "citeindex",
|
|
65
|
+
"purpose": null,
|
|
66
|
+
"fileCount": 15,
|
|
67
|
+
"lastAccessed": 1776178956715,
|
|
68
|
+
"keyFiles": [
|
|
69
|
+
"__init__.py",
|
|
70
|
+
"citation_style.py",
|
|
71
|
+
"cli.py",
|
|
72
|
+
"file_converter.py",
|
|
73
|
+
"llm.py"
|
|
74
|
+
]
|
|
75
|
+
},
|
|
76
|
+
"citeindex-rs": {
|
|
77
|
+
"path": "citeindex-rs",
|
|
78
|
+
"purpose": null,
|
|
79
|
+
"fileCount": 4,
|
|
80
|
+
"lastAccessed": 1776178956715,
|
|
81
|
+
"keyFiles": [
|
|
82
|
+
"Cargo.lock",
|
|
83
|
+
"Cargo.toml",
|
|
84
|
+
"citeindex-tui.log",
|
|
85
|
+
"config.toml"
|
|
86
|
+
]
|
|
87
|
+
},
|
|
88
|
+
"corpus": {
|
|
89
|
+
"path": "corpus",
|
|
90
|
+
"purpose": null,
|
|
91
|
+
"fileCount": 2,
|
|
92
|
+
"lastAccessed": 1776178956717,
|
|
93
|
+
"keyFiles": [
|
|
94
|
+
"_url_content_hashes.json",
|
|
95
|
+
"ingestion_log.jsonl"
|
|
96
|
+
]
|
|
97
|
+
},
|
|
98
|
+
"dist": {
|
|
99
|
+
"path": "dist",
|
|
100
|
+
"purpose": "Distribution/build output",
|
|
101
|
+
"fileCount": 2,
|
|
102
|
+
"lastAccessed": 1776178956717,
|
|
103
|
+
"keyFiles": [
|
|
104
|
+
"cite_extractor-0.10.8-py3-none-any.whl",
|
|
105
|
+
"cite_extractor-0.10.8.tar.gz"
|
|
106
|
+
]
|
|
107
|
+
},
|
|
108
|
+
"docs": {
|
|
109
|
+
"path": "docs",
|
|
110
|
+
"purpose": "Documentation",
|
|
111
|
+
"fileCount": 1,
|
|
112
|
+
"lastAccessed": 1776178956717,
|
|
113
|
+
"keyFiles": [
|
|
114
|
+
"v12-runtime-migration.md"
|
|
115
|
+
]
|
|
116
|
+
},
|
|
117
|
+
"example": {
|
|
118
|
+
"path": "example",
|
|
119
|
+
"purpose": null,
|
|
120
|
+
"fileCount": 55,
|
|
121
|
+
"lastAccessed": 1776178956717,
|
|
122
|
+
"keyFiles": [
|
|
123
|
+
"2013-tarsākyā:.json",
|
|
124
|
+
"2013_庄子集释_中华书局.json",
|
|
125
|
+
"2025_心念之病_光从东方来.md.json",
|
|
126
|
+
"A_Jiatandongfangjiaohui_2025_为何保守派倾向于认为其他教派是异端_Youtube.md.json",
|
|
127
|
+
"Bai-Wengu-白文固_2003_唐代僧籍管理制度.json"
|
|
128
|
+
]
|
|
129
|
+
},
|
|
130
|
+
"instruction": {
|
|
131
|
+
"path": "instruction",
|
|
132
|
+
"purpose": null,
|
|
133
|
+
"fileCount": 3,
|
|
134
|
+
"lastAccessed": 1776178956718,
|
|
135
|
+
"keyFiles": [
|
|
136
|
+
"CiteIndex Summary v11.pdf",
|
|
137
|
+
"CiteIndex v12 Full.pdf",
|
|
138
|
+
"Summary.md"
|
|
139
|
+
]
|
|
140
|
+
},
|
|
141
|
+
"test_output": {
|
|
142
|
+
"path": "test_output",
|
|
143
|
+
"purpose": null,
|
|
144
|
+
"fileCount": 1,
|
|
145
|
+
"lastAccessed": 1776178956718,
|
|
146
|
+
"keyFiles": [
|
|
147
|
+
"Media_content_from_URL.md.json"
|
|
148
|
+
]
|
|
149
|
+
},
|
|
150
|
+
"tests": {
|
|
151
|
+
"path": "tests",
|
|
152
|
+
"purpose": "Test files",
|
|
153
|
+
"fileCount": 22,
|
|
154
|
+
"lastAccessed": 1776178956718,
|
|
155
|
+
"keyFiles": [
|
|
156
|
+
"Bai-Wengu-白文固_2003_唐代僧籍管理制度.pdf",
|
|
157
|
+
"Bai-Yudong-白玉冬-2018-丝路景教与汪古渊流.pdf",
|
|
158
|
+
"Guo-qingfan-庄子集释 (郭庆藩).pdf",
|
|
159
|
+
"README.md.backup",
|
|
160
|
+
"main.py.backup"
|
|
161
|
+
]
|
|
162
|
+
}
|
|
163
|
+
},
|
|
164
|
+
"hotPaths": [
|
|
165
|
+
{
|
|
166
|
+
"path": "README.md",
|
|
167
|
+
"accessCount": 1,
|
|
168
|
+
"lastAccessed": 1776179072424,
|
|
169
|
+
"type": "file"
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"path": "pyproject.toml",
|
|
173
|
+
"accessCount": 1,
|
|
174
|
+
"lastAccessed": 1776179072470,
|
|
175
|
+
"type": "file"
|
|
176
|
+
},
|
|
177
|
+
{
|
|
178
|
+
"path": "citeindex/agents/__init__.py",
|
|
179
|
+
"accessCount": 1,
|
|
180
|
+
"lastAccessed": 1776179079362,
|
|
181
|
+
"type": "file"
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
"path": "citeindex/agents/v12_runtime.py",
|
|
185
|
+
"accessCount": 1,
|
|
186
|
+
"lastAccessed": 1776179096866,
|
|
187
|
+
"type": "file"
|
|
188
|
+
},
|
|
189
|
+
{
|
|
190
|
+
"path": "citeindex/cli.py",
|
|
191
|
+
"accessCount": 1,
|
|
192
|
+
"lastAccessed": 1776179096911,
|
|
193
|
+
"type": "file"
|
|
194
|
+
},
|
|
195
|
+
{
|
|
196
|
+
"path": "citeindex-rs/Cargo.toml",
|
|
197
|
+
"accessCount": 1,
|
|
198
|
+
"lastAccessed": 1776179104160,
|
|
199
|
+
"type": "file"
|
|
200
|
+
},
|
|
201
|
+
{
|
|
202
|
+
"path": "docs/v12-runtime-migration.md",
|
|
203
|
+
"accessCount": 1,
|
|
204
|
+
"lastAccessed": 1776179104203,
|
|
205
|
+
"type": "file"
|
|
206
|
+
},
|
|
207
|
+
{
|
|
208
|
+
"path": "citeindex/agents/models.py",
|
|
209
|
+
"accessCount": 1,
|
|
210
|
+
"lastAccessed": 1776179104260,
|
|
211
|
+
"type": "file"
|
|
212
|
+
},
|
|
213
|
+
{
|
|
214
|
+
"path": "citeindex/llm.py",
|
|
215
|
+
"accessCount": 1,
|
|
216
|
+
"lastAccessed": 1776180019547,
|
|
217
|
+
"type": "file"
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
"path": "citeindex/agents/chat.py",
|
|
221
|
+
"accessCount": 1,
|
|
222
|
+
"lastAccessed": 1776180019587,
|
|
223
|
+
"type": "file"
|
|
224
|
+
},
|
|
225
|
+
{
|
|
226
|
+
"path": "citeindex/agents/generation.py",
|
|
227
|
+
"accessCount": 1,
|
|
228
|
+
"lastAccessed": 1776180019616,
|
|
229
|
+
"type": "file"
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"path": "citeindex/agents/query_planner.py",
|
|
233
|
+
"accessCount": 1,
|
|
234
|
+
"lastAccessed": 1776181697113,
|
|
235
|
+
"type": "file"
|
|
236
|
+
},
|
|
237
|
+
{
|
|
238
|
+
"path": "citeindex/agents/integrity.py",
|
|
239
|
+
"accessCount": 1,
|
|
240
|
+
"lastAccessed": 1776181697156,
|
|
241
|
+
"type": "file"
|
|
242
|
+
},
|
|
243
|
+
{
|
|
244
|
+
"path": "citeindex-rs/crates/kernel/src/agent_runtime/mod.rs",
|
|
245
|
+
"accessCount": 1,
|
|
246
|
+
"lastAccessed": 1776182005187,
|
|
247
|
+
"type": "file"
|
|
248
|
+
},
|
|
249
|
+
{
|
|
250
|
+
"path": "citeindex-rs/crates/kernel/src/tools/mod.rs",
|
|
251
|
+
"accessCount": 1,
|
|
252
|
+
"lastAccessed": 1776182005231,
|
|
253
|
+
"type": "file"
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
"path": "citeindex-rs/crates/tui/src/app.rs",
|
|
257
|
+
"accessCount": 1,
|
|
258
|
+
"lastAccessed": 1776182005267,
|
|
259
|
+
"type": "file"
|
|
260
|
+
},
|
|
261
|
+
{
|
|
262
|
+
"path": "citeindex-rs/crates/core/src/engine.rs",
|
|
263
|
+
"accessCount": 1,
|
|
264
|
+
"lastAccessed": 1776182014922,
|
|
265
|
+
"type": "file"
|
|
266
|
+
},
|
|
267
|
+
{
|
|
268
|
+
"path": "citeindex-rs/crates/core/src/config.rs",
|
|
269
|
+
"accessCount": 1,
|
|
270
|
+
"lastAccessed": 1776182014951,
|
|
271
|
+
"type": "file"
|
|
272
|
+
},
|
|
273
|
+
{
|
|
274
|
+
"path": "citeindex-rs/crates/core/src/ipc.rs",
|
|
275
|
+
"accessCount": 1,
|
|
276
|
+
"lastAccessed": 1776182056484,
|
|
277
|
+
"type": "file"
|
|
278
|
+
},
|
|
279
|
+
{
|
|
280
|
+
"path": "citeindex-rs/config.toml",
|
|
281
|
+
"accessCount": 1,
|
|
282
|
+
"lastAccessed": 1776182061838,
|
|
283
|
+
"type": "file"
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"path": "citeindex/agents/coordinator.py",
|
|
287
|
+
"accessCount": 1,
|
|
288
|
+
"lastAccessed": 1776182061876,
|
|
289
|
+
"type": "file"
|
|
290
|
+
}
|
|
291
|
+
],
|
|
292
|
+
"userDirectives": []
|
|
293
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12.8
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: citeindex
|
|
3
|
+
Version: 0.12.0
|
|
4
|
+
Summary: Ingest sources with proper citation — PDF, URL, media, Office, DJVU
|
|
5
|
+
Author-email: ajia <yyjfwoaini@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: citeproc-py>=0.7.0
|
|
9
|
+
Requires-Dist: crawl4ai>=0.7.0
|
|
10
|
+
Requires-Dist: dspy-ai>=2.6.27
|
|
11
|
+
Requires-Dist: fasttext>=0.9.2
|
|
12
|
+
Requires-Dist: jsonschema>=4.20.0
|
|
13
|
+
Requires-Dist: litellm>=1.83.0
|
|
14
|
+
Requires-Dist: lxml>=4.9.0
|
|
15
|
+
Requires-Dist: mineru[all]>=2.6.4
|
|
16
|
+
Requires-Dist: ocrmypdf>=16.10.4
|
|
17
|
+
Requires-Dist: paddleocr>=3.1.0
|
|
18
|
+
Requires-Dist: paddlepaddle>=3.1.0
|
|
19
|
+
Requires-Dist: playwright>=1.40.0
|
|
20
|
+
Requires-Dist: pyannote-audio>=3.1.0
|
|
21
|
+
Requires-Dist: pymediainfo>=7.0.1
|
|
22
|
+
Requires-Dist: pymupdf[mupdf-third]>=1.26.3
|
|
23
|
+
Requires-Dist: pypdf2>=3.0.1
|
|
24
|
+
Requires-Dist: pypinyin>=0.51.0
|
|
25
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.1.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
28
|
+
Requires-Dist: readability-lxml>=0.8.1
|
|
29
|
+
Requires-Dist: requests>=2.31.0
|
|
30
|
+
Requires-Dist: setuptools>=80.9.0
|
|
31
|
+
Requires-Dist: trafilatura>=1.6.0
|
|
32
|
+
Requires-Dist: urllib3>=2.0.0
|
|
33
|
+
Requires-Dist: whisperx>=3.1.0
|
|
34
|
+
Requires-Dist: yt-dlp>=2025.7.21
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# CiteIndex
|
|
38
|
+
|
|
39
|
+
**v0.12.0** — Ingest sources with proper citation. PDF, URL, media, Office, DJVU.
|
|
40
|
+
|
|
41
|
+
Deterministic citation extraction, Merkle-verified integrity, CJK-first OCR.
|
|
42
|
+
Every claim is traced, verified, and cited — no hallucinations.
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
# Using rye (recommended)
|
|
48
|
+
rye sync
|
|
49
|
+
|
|
50
|
+
# Or pip
|
|
51
|
+
pip install -e .
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## CLI
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Ingest a PDF
|
|
58
|
+
citeindex paper.pdf
|
|
59
|
+
|
|
60
|
+
# Ingest a URL
|
|
61
|
+
citeindex https://example.com/article
|
|
62
|
+
|
|
63
|
+
# Crawl and ingest all articles from a site
|
|
64
|
+
citeindex https://example.com/articles --all-url-article --crawl-depth 2
|
|
65
|
+
|
|
66
|
+
# Crawl and re-ingest only changed pages
|
|
67
|
+
citeindex https://example.com/articles --update-url-article
|
|
68
|
+
|
|
69
|
+
# Options
|
|
70
|
+
citeindex paper.pdf --llm ollama/qwen3 --type thesis --is-primary
|
|
71
|
+
citeindex paper.pdf --text-direction vertical --vertical-lang ch
|
|
72
|
+
citeindex scanned.pdf --lang auto --page-range "1-10"
|
|
73
|
+
citeindex paper.pdf --no-layout # disable column/footnote detection
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Python API
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from citeindex import ingest, IngestionConfig
|
|
80
|
+
|
|
81
|
+
# Simple
|
|
82
|
+
result = ingest("paper.pdf")
|
|
83
|
+
print(result["status"]) # "ok"
|
|
84
|
+
|
|
85
|
+
# With config
|
|
86
|
+
config = IngestionConfig(
|
|
87
|
+
llm_model="ollama/qwen3",
|
|
88
|
+
text_direction="vertical",
|
|
89
|
+
is_primary=True,
|
|
90
|
+
)
|
|
91
|
+
result = ingest("paper.pdf", corpus_root="my_corpus", config=config)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Ingestion Pipelines
|
|
95
|
+
|
|
96
|
+
CiteIndex automatically detects the input type and routes to the correct pipeline:
|
|
97
|
+
|
|
98
|
+
### Digital PDF
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
PDF → GROBID (metadata) → MinerU (layout) → DSPy reconciliation
|
|
102
|
+
→ document structure (pages/columns/paragraphs/lines)
|
|
103
|
+
→ Merkle tree → store to corpus/
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
- **GROBID** extracts metadata and references deterministically
|
|
107
|
+
- **MinerU** performs layout analysis (columns, footnotes, tables)
|
|
108
|
+
- **DSPy** reconciles GROBID output with pattern extraction as fallback
|
|
109
|
+
- Builds section-hierarchical document structure with actual page numbers
|
|
110
|
+
|
|
111
|
+
### Scanned PDF
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
PDF → OCRmyPDF (normalize) → PaddleOCR (vertical detect) → MinerU (layout)
|
|
115
|
+
→ Tesseract (text) → GROBID (citations) → document structure
|
|
116
|
+
→ Merkle tree → store to corpus/
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
- **OCRmyPDF** normalizes scanned pages and adds a text layer to them
|
|
120
|
+
- **PaddleOCR** detects CJK vertical text layouts
|
|
121
|
+
- **Tesseract** provides OCR with auto-detected language
|
|
122
|
+
- Supports `--text-direction vertical` for traditional Chinese/Japanese
|
|
123
|
+
|
|
124
|
+
### URL Article
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
URL → Playwright/requests (fetch) → trafilatura (content)
|
|
128
|
+
→ Zotero (metadata) → CSL JSON → deterministic chunking
|
|
129
|
+
→ hashes → Merkle tree → store to corpus/
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
- **Playwright** renders JavaScript-heavy pages (falls back to **requests**)
|
|
133
|
+
- **trafilatura** extracts clean text with heading structure
|
|
134
|
+
- **Zotero** extracts citation metadata (title, authors, date, DOI)
|
|
135
|
+
- Discovers in-page citation guidance (若要引用 / Cite this / etc.)
|
|
136
|
+
- Supports batch crawling with `--all-url-article` and `--update-url-article`
|
|
137
|
+
|
|
138
|
+
### Media
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
URL/File → yt-dlp (download) → ffmpeg (audio) → WhisperX (transcription)
|
|
142
|
+
→ pyannote (diarization, optional) → CSL JSON
|
|
143
|
+
→ chunking → hashes → Merkle tree → store to corpus/
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
- **yt-dlp** downloads from YouTube, Vimeo, podcasts, etc.
|
|
147
|
+
- **WhisperX** transcribes with word-level timestamps
|
|
148
|
+
- **pyannote** performs speaker diarization (optional)
|
|
149
|
+
- Supports audio (`.mp3`, `.wav`, `.m4a`) and video (`.mp4`, `.mkv`, `.webm`)
|
|
150
|
+
|
|
151
|
+
### Office & DJVU
|
|
152
|
+
|
|
153
|
+
Office documents (`.docx`, `.doc`, `.rtf`, `.odt`, `.pptx`, `.ppt`, `.odp`) and DJVU (`.djvu`) are converted to PDF via LibreOffice/ddjvu, then routed to the digital or scanned PDF pipeline.
|
|
154
|
+
|
|
155
|
+
## Configuration Reference
|
|
156
|
+
|
|
157
|
+
| Option | CLI Flag | Default | Description |
|
|
158
|
+
|--------|----------|---------|-------------|
|
|
159
|
+
| `llm_model` | `--llm` | `ollama/qwen3` | LLM model for citation extraction |
|
|
160
|
+
| `text_direction` | `--text-direction`, `-td` | `horizontal` | `horizontal`, `auto`, or `vertical` |
|
|
161
|
+
| `vertical_lang` | `--vertical-lang` | `ch` | CJK language: `ch` (Chinese) or `japan` (Japanese) |
|
|
162
|
+
| `lang` | `--lang`, `-l` | `auto` | OCR language (auto-detect or Tesseract code) |
|
|
163
|
+
| `page_range` | `--page-range`, `-p` | `1-5, -3` | Pages to extract (e.g. `"1-10"`, `"1-5, -3"`) |
|
|
164
|
+
| `doc_type_override` | `--type`, `-t` | auto | `book`, `thesis`, `journal`, or `bookchapter` |
|
|
165
|
+
| `use_layout_analysis` | `--no-layout` | `True` | Column/footnote detection (on by default; pass `--no-layout` to disable it) |
|
|
166
|
+
| `is_primary` | `--is-primary` | `False` | Line-level granularity (vs paragraph-level) |
|
|
167
|
+
| `use_pageindex` | `--use-pageindex` | `False` | LLM-driven section hierarchy (requires Ollama) |
|
|
168
|
+
| `pageindex_model` | `--pageindex-model` | `ollama/qwen3.5:cloud` | LLM for PageIndex tree building |
|
|
169
|
+
| `citation_style` | (API only) | `chicago-author-date` | CSL citation style for output |
|
|
170
|
+
| `corpus_root` | `--corpus-root` | `corpus` | Output directory for ingested artifacts |
|
|
171
|
+
| `schema_version` | `--schema-version` | `1.0.0` | Output schema version tag |
|
|
172
|
+
|
|
173
|
+
## Output
|
|
174
|
+
|
|
175
|
+
Each ingestion produces a corpus folder (e.g., `corpus/Author_2024_Title/`) containing:
|
|
176
|
+
|
|
177
|
+
| File | Description |
|
|
178
|
+
|------|-------------|
|
|
179
|
+
| `csl.json` | Citation metadata (CSL-JSON with `ci_*` extensions: `content_hash`, `merkle_root`, `source_type`, `ingestion_timestamp`) |
|
|
180
|
+
| `document.json` | Structured document tree (PageIndex) — sections, pages, paragraphs, lines |
|
|
181
|
+
| `merkle.json` | SHA-256 Merkle tree for integrity verification |
|
|
182
|
+
| `ingestion_output.json` | Full ingestion result with all pipeline outputs |
|
|
183
|
+
| `library.md` | Human-readable citation with extracted text and footnotes |
|
|
184
|
+
|
|
185
|
+
### Return Value
|
|
186
|
+
|
|
187
|
+
The `ingest()` function returns a dict:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
{
|
|
191
|
+
"status": "ok", # "ok" or "blocked"
|
|
192
|
+
"document_path": "corpus/Author_2024_Title",
|
|
193
|
+
"standardized_csl_json": { ... }, # Full CSL-JSON with ci_ extensions
|
|
194
|
+
"sub_pipeline_outputs": { ... }, # Raw pipeline results
|
|
195
|
+
"ingestion_log_entry": { ... }, # Log entry with merkle_root
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
# On failure:
|
|
199
|
+
{
|
|
200
|
+
"status": "blocked",
|
|
201
|
+
"stage": "detect_resource_type",
|
|
202
|
+
"error_code": "unsupported_input",
|
|
203
|
+
"error_message": "Unsupported input: ...",
|
|
204
|
+
"next_action": "Provide PDF, URL, or media file",
|
|
205
|
+
}
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Supported Formats
|
|
209
|
+
|
|
210
|
+
| Format | Extension / Protocol |
|
|
211
|
+
|--------|----------------------|
|
|
212
|
+
| Digital PDF | `.pdf` (with embedded text) |
|
|
213
|
+
| Scanned PDF | `.pdf` (image-based, OCR applied) |
|
|
214
|
+
| URL Article | `http://` / `https://` |
|
|
215
|
+
| Media | `.mp3`, `.wav`, `.m4a`, `.mp4`, `.mkv`, `.webm` |
|
|
216
|
+
| Office | `.docx`, `.doc`, `.rtf`, `.odt`, `.pptx`, `.ppt`, `.odp` |
|
|
217
|
+
| DJVU | `.djvu` |
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
MIT
|