querdex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. querdex-0.1.0/.env.example +10 -0
  2. querdex-0.1.0/.github/workflows/ci.yml +25 -0
  3. querdex-0.1.0/.github/workflows/publish.yml +48 -0
  4. querdex-0.1.0/.gitignore +35 -0
  5. querdex-0.1.0/Makefile +13 -0
  6. querdex-0.1.0/PKG-INFO +384 -0
  7. querdex-0.1.0/README.md +352 -0
  8. querdex-0.1.0/infra/Dockerfile +14 -0
  9. querdex-0.1.0/infra/ENVIRONMENTS.md +20 -0
  10. querdex-0.1.0/infra/README.md +3 -0
  11. querdex-0.1.0/infra/docker-compose.yml +14 -0
  12. querdex-0.1.0/pyproject.toml +71 -0
  13. querdex-0.1.0/scripts/dev_index_and_query.sh +9 -0
  14. querdex-0.1.0/scripts/run_release_gate.py +67 -0
  15. querdex-0.1.0/src/querdex/__init__.py +5 -0
  16. querdex-0.1.0/src/querdex/adaptive/__init__.py +3 -0
  17. querdex-0.1.0/src/querdex/adaptive/updater.py +143 -0
  18. querdex-0.1.0/src/querdex/cli.py +60 -0
  19. querdex-0.1.0/src/querdex/evaluation/__init__.py +12 -0
  20. querdex-0.1.0/src/querdex/evaluation/harness.py +55 -0
  21. querdex-0.1.0/src/querdex/evaluation/metrics.py +79 -0
  22. querdex-0.1.0/src/querdex/indexing/__init__.py +28 -0
  23. querdex-0.1.0/src/querdex/indexing/coordinator.py +79 -0
  24. querdex-0.1.0/src/querdex/indexing/diff_reindex.py +108 -0
  25. querdex-0.1.0/src/querdex/indexing/entity_extractor.py +84 -0
  26. querdex-0.1.0/src/querdex/indexing/entity_map_updater.py +42 -0
  27. querdex-0.1.0/src/querdex/indexing/graph_builder.py +168 -0
  28. querdex-0.1.0/src/querdex/indexing/quality.py +56 -0
  29. querdex-0.1.0/src/querdex/indexing/tree_builder.py +242 -0
  30. querdex-0.1.0/src/querdex/ingestion/__init__.py +3 -0
  31. querdex-0.1.0/src/querdex/ingestion/base.py +12 -0
  32. querdex-0.1.0/src/querdex/ingestion/orchestrator.py +85 -0
  33. querdex-0.1.0/src/querdex/ingestion/parsers/__init__.py +28 -0
  34. querdex-0.1.0/src/querdex/ingestion/parsers/audio_video_parser.py +65 -0
  35. querdex-0.1.0/src/querdex/ingestion/parsers/code_parser.py +100 -0
  36. querdex-0.1.0/src/querdex/ingestion/parsers/csv_parser.py +71 -0
  37. querdex-0.1.0/src/querdex/ingestion/parsers/docx_parser.py +63 -0
  38. querdex-0.1.0/src/querdex/ingestion/parsers/html_parser.py +43 -0
  39. querdex-0.1.0/src/querdex/ingestion/parsers/markdown_parser.py +65 -0
  40. querdex-0.1.0/src/querdex/ingestion/parsers/ocr.py +80 -0
  41. querdex-0.1.0/src/querdex/ingestion/parsers/pdf_parser.py +188 -0
  42. querdex-0.1.0/src/querdex/ingestion/parsers/sqlite_parser.py +58 -0
  43. querdex-0.1.0/src/querdex/ingestion/parsers/text_parser.py +29 -0
  44. querdex-0.1.0/src/querdex/ingestion/parsers/url_parser.py +70 -0
  45. querdex-0.1.0/src/querdex/llm/__init__.py +4 -0
  46. querdex-0.1.0/src/querdex/llm/anthropic_client.py +47 -0
  47. querdex-0.1.0/src/querdex/llm/client.py +63 -0
  48. querdex-0.1.0/src/querdex/llm/fake_client.py +41 -0
  49. querdex-0.1.0/src/querdex/llm/openai_client.py +48 -0
  50. querdex-0.1.0/src/querdex/ops/__init__.py +5 -0
  51. querdex-0.1.0/src/querdex/ops/health.py +18 -0
  52. querdex-0.1.0/src/querdex/ops/observability.py +44 -0
  53. querdex-0.1.0/src/querdex/ops/retry.py +35 -0
  54. querdex-0.1.0/src/querdex/query/__init__.py +18 -0
  55. querdex-0.1.0/src/querdex/query/analyzer.py +69 -0
  56. querdex-0.1.0/src/querdex/query/answering.py +95 -0
  57. querdex-0.1.0/src/querdex/query/graph_walker.py +35 -0
  58. querdex-0.1.0/src/querdex/query/multi_doc.py +34 -0
  59. querdex-0.1.0/src/querdex/query/router.py +8 -0
  60. querdex-0.1.0/src/querdex/query/tiered_search.py +245 -0
  61. querdex-0.1.0/src/querdex/schemas/__init__.py +25 -0
  62. querdex-0.1.0/src/querdex/schemas/models.py +135 -0
  63. querdex-0.1.0/src/querdex/services/__init__.py +3 -0
  64. querdex-0.1.0/src/querdex/services/engine.py +647 -0
  65. querdex-0.1.0/src/querdex/services/interfaces.py +18 -0
  66. querdex-0.1.0/src/querdex/storage/__init__.py +4 -0
  67. querdex-0.1.0/src/querdex/storage/graph_store.py +91 -0
  68. querdex-0.1.0/src/querdex/storage/sqlite_store.py +714 -0
  69. querdex-0.1.0/src/querdex/utils/__init__.py +14 -0
  70. querdex-0.1.0/src/querdex/utils/llm_validation.py +60 -0
  71. querdex-0.1.0/src/querdex/utils/query_cluster.py +29 -0
  72. querdex-0.1.0/src/querdex/utils/tree_ops.py +28 -0
  73. querdex-0.1.0/tests/fixtures/eval/baseline_cases.json +17 -0
  74. querdex-0.1.0/tests/fixtures/eval/kpi_baseline.json +7 -0
  75. querdex-0.1.0/tests/fixtures/golden/parser_manifest.json +37 -0
  76. querdex-0.1.0/tests/fixtures/golden/sample.csv +3 -0
  77. querdex-0.1.0/tests/fixtures/golden/sample.html +6 -0
  78. querdex-0.1.0/tests/fixtures/golden/sample.md +7 -0
  79. querdex-0.1.0/tests/fixtures/golden/sample.py +7 -0
  80. querdex-0.1.0/tests/fixtures/golden/sample.txt +3 -0
  81. querdex-0.1.0/tests/test_adaptive_updater.py +121 -0
  82. querdex-0.1.0/tests/test_diff_reindex.py +109 -0
  83. querdex-0.1.0/tests/test_docx_integration.py +39 -0
  84. querdex-0.1.0/tests/test_entity_map_updater.py +26 -0
  85. querdex-0.1.0/tests/test_evaluation_harness.py +73 -0
  86. querdex-0.1.0/tests/test_graph_builder.py +73 -0
  87. querdex-0.1.0/tests/test_graph_store.py +21 -0
  88. querdex-0.1.0/tests/test_ingestion.py +61 -0
  89. querdex-0.1.0/tests/test_ingestion_additional.py +118 -0
  90. querdex-0.1.0/tests/test_llm_integration.py +282 -0
  91. querdex-0.1.0/tests/test_metrics_harness.py +81 -0
  92. querdex-0.1.0/tests/test_ocr_provider.py +53 -0
  93. querdex-0.1.0/tests/test_parser_golden.py +21 -0
  94. querdex-0.1.0/tests/test_pdf_integration.py +62 -0
  95. querdex-0.1.0/tests/test_pdf_parser.py +190 -0
  96. querdex-0.1.0/tests/test_pipeline.py +42 -0
  97. querdex-0.1.0/tests/test_query_analyzer.py +10 -0
  98. querdex-0.1.0/tests/test_query_graph_multi.py +84 -0
  99. querdex-0.1.0/tests/test_reindex.py +56 -0
  100. querdex-0.1.0/tests/test_routing_suite.py +32 -0
  101. querdex-0.1.0/tests/test_schemas.py +62 -0
  102. querdex-0.1.0/tests/test_store.py +70 -0
  103. querdex-0.1.0/tests/test_tree_builder.py +53 -0
  104. querdex-0.1.0/tests/test_tree_quality.py +60 -0
  105. querdex-0.1.0/uv.lock +815 -0
@@ -0,0 +1,10 @@
1
+ QUERDEX_DB=./index_store/querdex.db
2
+ QUERDEX_OCR_ENABLED=false
3
+ # QUERDEX_OCR_PROVIDER=tesseract
4
+ # QUERDEX_TESSERACT_CMD=tesseract
5
+ # QUERDEX_OCR_ENDPOINT=https://ocr.example.com/v1/ocr
6
+ # QUERDEX_OCR_API_KEY=replace_me
7
+ # QUERDEX_LLM_PROVIDER=anthropic
8
+ # QUERDEX_LLM_API_KEY=replace_me
9
+ # QUERDEX_LLM_TIER1_MODEL=claude-haiku-4-5-20251001
10
+ # QUERDEX_LLM_TIER2_MODEL=claude-sonnet-4-6
@@ -0,0 +1,25 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - name: Setup Python
13
+ uses: actions/setup-python@v5
14
+ with:
15
+ python-version: '3.12'
16
+ - name: Install uv
17
+ run: pip install uv
18
+ - name: Sync deps
19
+ run: uv sync --extra dev
20
+ - name: Ruff
21
+ run: uv run --extra dev ruff check .
22
+ - name: MyPy
23
+ run: uv run --extra dev mypy src
24
+ - name: Tests
25
+ run: uv run --extra dev pytest
@@ -0,0 +1,48 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published] # triggers when you create a GitHub Release
6
+
7
+ jobs:
8
+ build:
9
+ name: Build distribution
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+
15
+ - name: Set up Python
16
+ uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.11"
19
+
20
+ - name: Install build tools
21
+ run: pip install build
22
+
23
+ - name: Build wheel and sdist
24
+ run: python -m build
25
+
26
+ - name: Upload build artifacts
27
+ uses: actions/upload-artifact@v4
28
+ with:
29
+ name: dist
30
+ path: dist/
31
+
32
+ publish:
33
+ name: Publish to PyPI
34
+ needs: build
35
+ runs-on: ubuntu-latest
36
+ environment: pypi # must match the environment name entered in PyPI's trusted-publisher form
37
+ permissions:
38
+ id-token: write # required for OIDC trusted publishing
39
+
40
+ steps:
41
+ - name: Download build artifacts
42
+ uses: actions/download-artifact@v4
43
+ with:
44
+ name: dist
45
+ path: dist/
46
+
47
+ - name: Publish to PyPI
48
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,35 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+
8
+ # Virtual env & tools
9
+ .venv/
10
+ .pytest_cache/
11
+ .mypy_cache/
12
+ .ruff_cache/
13
+
14
+ # Runtime data
15
+ index_store/
16
+ *.sqlite
17
+ *.db
18
+
19
+ # OS
20
+ .DS_Store
21
+
22
+ # Documents / paper drafts (not source code)
23
+ *.pdf
24
+ *.tex
25
+ *.docx
26
+
27
+ # Project management / planning
28
+ PROJECT_PLAN.md
29
+ TASK_LIST.md
30
+ RUNBOOK.md
31
+
32
+ # Secrets
33
+ .env
34
+ .env.*
35
+ !.env.example
querdex-0.1.0/Makefile ADDED
@@ -0,0 +1,13 @@
1
+ .PHONY: setup test lint typecheck
2
+
3
+ setup:
4
+ uv sync --extra dev
5
+
6
+ test:
7
+ uv run pytest
8
+
9
+ lint:
10
+ uv run ruff check .
11
+
12
+ typecheck:
13
+ uv run mypy src
querdex-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,384 @@
1
+ Metadata-Version: 2.4
2
+ Name: querdex
3
+ Version: 0.1.0
4
+ Summary: Reasoning-first document intelligence system
5
+ Author-email: Your Name <you@example.com>
6
+ License: MIT
7
+ Keywords: document,indexing,llm,rag,retrieval
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Topic :: Text Processing :: Indexing
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: beautifulsoup4>=4.12.3
18
+ Requires-Dist: markdown-it-py>=3.0.0
19
+ Requires-Dist: networkx>=3.4.2
20
+ Requires-Dist: pydantic>=2.9.0
21
+ Requires-Dist: pymupdf>=1.24.0
22
+ Requires-Dist: python-docx>=1.1.2
23
+ Provides-Extra: anthropic
24
+ Requires-Dist: anthropic>=0.49.0; extra == 'anthropic'
25
+ Provides-Extra: dev
26
+ Requires-Dist: mypy>=1.13.0; extra == 'dev'
27
+ Requires-Dist: pytest>=8.3.3; extra == 'dev'
28
+ Requires-Dist: ruff>=0.8.6; extra == 'dev'
29
+ Provides-Extra: openai
30
+ Requires-Dist: openai>=1.67.0; extra == 'openai'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # Querdex
34
+
35
+ **Reasoning-first document intelligence system.**
36
+
37
+ Querdex indexes any document into a hierarchical tree, then uses a two-tier LLM search to answer questions with cited sources. It works without an LLM (keyword heuristics), and optionally plugs in Anthropic or OpenAI for higher-quality results.
38
+
39
+ ---
40
+
41
+ ## Table of Contents
42
+
43
+ - [How it works](#how-it-works)
44
+ - [Installation](#installation)
45
+ - [Quick Start (CLI)](#quick-start-cli)
46
+ - [LLM Setup](#llm-setup)
47
+ - [CLI Reference](#cli-reference)
48
+ - [Python API](#python-api)
49
+ - [Supported File Types](#supported-file-types)
50
+ - [Environment Variables](#environment-variables)
51
+
52
+ ---
53
+
54
+ ## How it works
55
+
56
+ ```
57
+ Document
58
+
59
+
60
+ Ingestion ──► parse into pages/sections (Section[])
61
+
62
+
63
+ Indexing ───► build hierarchical tree (TreeNode) + entity map + knowledge graph
64
+
65
+
66
+ Storage ────► persist to SQLite (sections, tree, entities, graph, query cache)
67
+
68
+
69
+ Query
70
+ ├─ Tier 1: LLM (or keyword) batch-prune of tree nodes
71
+ ├─ Tier 2: LLM (or heuristic) per-node relevance scoring
72
+ ├─ Retrieval: pull section text for selected nodes
73
+ └─ Answer: LLM synthesizes answer with source citations
74
+
75
+
76
+ Adaptive ───► update node summaries based on query feedback (runs in background)
77
+ ```
78
+
79
+ One of three query routes is selected automatically:
80
+ - **single_doc** — standard hierarchical search on one document
81
+ - **multi_doc** — virtual super-tree across up to 3 documents
82
+ - **graph** — entity-seeded graph walk for relationship queries ("how does X relate to Y?")
83
+
84
+ ---
85
+
86
+ ## Installation
87
+
88
+ **Base install** (no LLM, uses keyword heuristics):
89
+ ```bash
90
+ pip install querdex
91
+ ```
92
+
93
+ **With Anthropic (Claude):**
94
+ ```bash
95
+ pip install "querdex[anthropic]"
96
+ ```
97
+
98
+ **With OpenAI (GPT):**
99
+ ```bash
100
+ pip install "querdex[openai]"
101
+ ```
102
+
103
+ **Development:**
104
+ ```bash
105
+ git clone <repo>
106
+ cd querdex
107
+ uv sync --extra dev
108
+ # or with an LLM provider:
109
+ uv sync --extra dev --extra anthropic
110
+ uv sync --extra dev --extra openai
111
+ ```
112
+
113
+ **Requirements:** Python 3.11+
114
+
115
+ ---
116
+
117
+ ## Quick Start (CLI)
118
+
119
+ ### 1. Index a document
120
+
121
+ ```bash
122
+ querdex index ./report.pdf --doc-id annual-report
123
+ ```
124
+
125
+ Output:
126
+ ```
127
+ Indexed doc_id=annual-report version=1
128
+ Nodes=12 max_depth=3
129
+ ```
130
+
131
+ ### 2. Query it
132
+
133
+ ```bash
134
+ querdex query --doc-id annual-report --query "What was the Q3 revenue?"
135
+ ```
136
+
137
+ Output:
138
+ ```
139
+ Query ID: 3f8a1c...
140
+ Intent: single_doc | Cache hit: False
141
+ Q3 revenue was $1.2B, up 8% year-over-year (Revenue Analysis, pages 4-6).
142
+ ```
143
+
144
+ ### 3. Multi-turn conversation (session)
145
+
146
+ ```bash
147
+ # First turn
148
+ querdex query --doc-id annual-report \
149
+ --query "What were the risk factors?" \
150
+ --session-id session_001
151
+
152
+ # Second turn — context from first turn is carried over
153
+ querdex query --doc-id annual-report \
154
+ --query "Which of those risks materialised?" \
155
+ --session-id session_001
156
+ ```
157
+
158
+ ### 4. Re-index an updated document
159
+
160
+ When the document changes, Querdex rebuilds only the affected parts:
161
+ ```bash
162
+ querdex index ./report_v2.pdf --doc-id annual-report
163
+ ```
164
+
165
+ ### 5. Delete a document
166
+
167
+ ```bash
168
+ querdex delete --doc-id annual-report
169
+ ```
170
+
171
+ ### Custom database path
172
+
173
+ By default the database is stored at `./index_store/querdex.db`. To change it:
174
+ ```bash
175
+ querdex --db /path/to/my.db index ./report.pdf --doc-id demo
176
+ querdex --db /path/to/my.db query --doc-id demo --query "summary?"
177
+ ```
178
+
179
+ ---
180
+
181
+ ## LLM Setup
182
+
183
+ Without any LLM configured, Querdex falls back to keyword/heuristic matching — it still always produces an answer, just a less precise one.
184
+
185
+ ### Anthropic (Claude)
186
+
187
+ ```bash
188
+ export QUERDEX_LLM_PROVIDER=anthropic
189
+ export QUERDEX_LLM_API_KEY=sk-ant-...
190
+
191
+ # Optional: override model defaults
192
+ export QUERDEX_LLM_TIER1_MODEL=claude-haiku-4-5-20251001 # fast, cheap (batch prune)
193
+ export QUERDEX_LLM_TIER2_MODEL=claude-sonnet-4-6 # powerful (deep reasoning + answers)
194
+ ```
195
+
196
+ ### OpenAI (GPT)
197
+
198
+ ```bash
199
+ export QUERDEX_LLM_PROVIDER=openai
200
+ export QUERDEX_LLM_API_KEY=sk-...
201
+
202
+ # Optional: override model defaults
203
+ export QUERDEX_LLM_TIER1_MODEL=gpt-4o-mini # fast, cheap
204
+ export QUERDEX_LLM_TIER2_MODEL=gpt-4o # powerful
205
+ ```
206
+
207
+ **How the two tiers are used:**
208
+
209
+ | Tier | Model | Purpose |
210
+ |------|-------|---------|
211
+ | Tier 1 | cheap/fast | Single batched call to prune all tree nodes to the relevant few |
212
+ | Tier 2 | powerful | Per-node deep reasoning to confirm relevance + score confidence |
213
+ | Answer | powerful | Synthesise a cited answer from the retrieved section text |
214
+
215
+ ---
216
+
217
+ ## CLI Reference
218
+
219
+ ```
220
+ querdex [--db PATH] <command> [options]
221
+ ```
222
+
223
+ | Command | Description |
224
+ |---------|-------------|
225
+ | `index <file>` | Index a document. Auto-detects format from extension. |
226
+ | `query` | Query an indexed document. |
227
+ | `delete` | Remove a document and all its data from the store. |
228
+
229
+ ### `index`
230
+
231
+ ```
232
+ querdex index <file_path> [--doc-id ID]
233
+ ```
234
+
235
+ | Argument | Default | Description |
236
+ |----------|---------|-------------|
237
+ | `file_path` | required | Path to the document to index |
238
+ | `--doc-id` | auto-generated from filename+hash | Stable identifier for this document |
239
+
240
+ ### `query`
241
+
242
+ ```
243
+ querdex query --doc-id ID --query TEXT [--session-id ID]
244
+ ```
245
+
246
+ | Argument | Default | Description |
247
+ |----------|---------|-------------|
248
+ | `--doc-id` | required | Document to query |
249
+ | `--query` | required | Natural language question |
250
+ | `--session-id` | none | Enables multi-turn context (pass the same ID across turns) |
251
+
252
+ ### `delete`
253
+
254
+ ```
255
+ querdex delete --doc-id ID
256
+ ```
257
+
258
+ ---
259
+
260
+ ## Python API
261
+
262
+ For integration into your own application:
263
+
264
+ ```python
265
+ import asyncio
266
+ from querdex.services import build_engine
267
+
268
+ # build_engine reads QUERDEX_LLM_* env vars automatically
269
+ engine = build_engine("./index_store/querdex.db")
270
+
271
+ # Index a document
272
+ doc = asyncio.run(engine.index_document("./report.pdf", doc_id="annual-report"))
273
+ print(f"Indexed: {doc.doc_id} | nodes={doc.stats.total_nodes}")
274
+
275
+ # Query
276
+ result = engine.query_document("annual-report", "What was Q3 revenue?")
277
+ print(result.answer)
278
+ print(f"Confidence: {result.confidence:.0%}")
279
+ for source in result.source_nodes:
280
+ print(f" Source: {source.title}, pages {source.pages}")
281
+
282
+ # Multi-turn query
283
+ result2 = engine.query_document(
284
+ "annual-report",
285
+ "What caused that increase?",
286
+ session_id="my-session-001",
287
+ )
288
+
289
+ # Re-index after the document changes
290
+ doc_v2 = asyncio.run(engine.reindex_document("./report_v2.pdf", doc_id="annual-report"))
291
+
292
+ # Delete
293
+ engine.store.delete_document("annual-report")
294
+
295
+ # Always close when done
296
+ engine.store.close()
297
+ ```
298
+
299
+ ### Passing an LLM client directly
300
+
301
+ ```python
302
+ from querdex.llm.anthropic_client import AnthropicLLMClient
303
+ from querdex.services.engine import QuerdexEngine
304
+ from querdex.storage import SQLiteStore
305
+
306
+ llm = AnthropicLLMClient(
307
+ api_key="sk-ant-...",
308
+ tier1_model="claude-haiku-4-5-20251001",
309
+ tier2_model="claude-sonnet-4-6",
310
+ )
311
+ store = SQLiteStore("./querdex.db")
312
+ engine = QuerdexEngine(store, llm_client=llm)
313
+ ```
314
+
315
+ ### Using the FakeLLMClient in tests
316
+
317
+ ```python
318
+ from querdex.llm.fake_client import FakeLLMClient
319
+ from querdex.query.answering import AnswerGenerator
320
+
321
+ fake = FakeLLMClient(
322
+ default='{"answer": "Revenue was $1.2B.", "confidence": 0.9}'
323
+ )
324
+ gen = AnswerGenerator(llm_client=fake)
325
+ answer, confidence, sources = gen.generate("What was revenue?", chunks)
326
+ ```
327
+
328
+ ---
329
+
330
+ ## Supported File Types
331
+
332
+ | Extension | Parser | Notes |
333
+ |-----------|--------|-------|
334
+ | `.txt` | TextParser | Plain text, split by paragraphs |
335
+ | `.md`, `.markdown` | MarkdownParser | Heading-aware section splitting |
336
+ | `.html`, `.htm` | HTMLParser | Strips tags, extracts text blocks |
337
+ | `.docx` | DOCXParser | Microsoft Word, paragraph-level |
338
+ | `.pdf` | PDFParser | Page-level; OCR optional (see below) |
339
+ | `.py` | PythonCodeParser | Function/class level chunking |
340
+ | `.js`, `.ts`, `.jsx`, `.tsx` | JSCodeParser | Function-level chunking |
341
+ | `.csv` | CSVParser | Row-batched sections |
342
+ | `.db`, `.sqlite` | SQLiteParser | Table-level sections |
343
+ | `.mp3`, `.wav`, `.m4a`, `.mp4`, `.mov` | AudioVideoParser | Transcript-based (requires Whisper or similar) |
344
+ | `.url` | URLParser | Fetches and parses the web page at that URL |
345
+ | URL string | URLParser | Pass a URL string directly as the file path |
346
+
347
+ ### PDF OCR
348
+
349
+ For scanned PDFs, enable OCR via environment variables:
350
+
351
+ ```bash
352
+ # Tesseract (local)
353
+ export QUERDEX_OCR_ENABLED=true
354
+ export QUERDEX_OCR_PROVIDER=tesseract # default when OCR is enabled
355
+ export QUERDEX_TESSERACT_CMD=tesseract # path to tesseract binary
356
+
357
+ # Cloud OCR (custom endpoint)
358
+ export QUERDEX_OCR_ENABLED=true
359
+ export QUERDEX_OCR_PROVIDER=cloud
360
+ export QUERDEX_OCR_ENDPOINT=https://your-ocr-api.com/v1/ocr
361
+ export QUERDEX_OCR_API_KEY=your-key
362
+ ```
363
+
364
+ ---
365
+
366
+ ## Environment Variables
367
+
368
+ | Variable | Default | Description |
369
+ |----------|---------|-------------|
370
+ | `QUERDEX_LLM_PROVIDER` | _(none)_ | `anthropic` or `openai`. If unset, heuristic mode is used. |
371
+ | `QUERDEX_LLM_API_KEY` | _(none)_ | API key for the selected provider |
372
+ | `QUERDEX_LLM_TIER1_MODEL` | `claude-haiku-4-5-20251001` / `gpt-4o-mini` | Fast model for batch node pruning |
373
+ | `QUERDEX_LLM_TIER2_MODEL` | `claude-sonnet-4-6` / `gpt-4o` | Powerful model for deep reasoning and answers |
374
+ | `QUERDEX_OCR_ENABLED` | `false` | Enable OCR for scanned PDFs |
375
+ | `QUERDEX_OCR_PROVIDER` | `tesseract` | `tesseract` or `cloud` |
376
+ | `QUERDEX_TESSERACT_CMD` | `tesseract` | Path to Tesseract binary |
377
+ | `QUERDEX_OCR_ENDPOINT` | _(none)_ | Endpoint URL for cloud OCR provider |
378
+ | `QUERDEX_OCR_API_KEY` | _(none)_ | API key for cloud OCR provider |
379
+
380
+ ---
381
+
382
+ ## License
383
+
384
+ MIT