clichefactory 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. clichefactory-0.1.0/.gitignore +49 -0
  2. clichefactory-0.1.0/LICENSE +21 -0
  3. clichefactory-0.1.0/PKG-INFO +473 -0
  4. clichefactory-0.1.0/README.md +439 -0
  5. clichefactory-0.1.0/clichefactory/__about__.py +1 -0
  6. clichefactory-0.1.0/clichefactory/__init__.py +42 -0
  7. clichefactory-0.1.0/clichefactory/_config.py +164 -0
  8. clichefactory-0.1.0/clichefactory/_engine/__init__.py +6 -0
  9. clichefactory-0.1.0/clichefactory/_engine/adapters/__init__.py +0 -0
  10. clichefactory-0.1.0/clichefactory/_engine/adapters/csv_adapter.py +139 -0
  11. clichefactory-0.1.0/clichefactory/_engine/adapters/docling_adapter.py +280 -0
  12. clichefactory-0.1.0/clichefactory/_engine/adapters/eml_adapter.py +45 -0
  13. clichefactory-0.1.0/clichefactory/_engine/adapters/vlm_adapter.py +72 -0
  14. clichefactory-0.1.0/clichefactory/_engine/adapters/xlsx_adapter.py +142 -0
  15. clichefactory-0.1.0/clichefactory/_engine/ai_clients/__init__.py +20 -0
  16. clichefactory-0.1.0/clichefactory/_engine/ai_clients/anthropic_client.py +308 -0
  17. clichefactory-0.1.0/clichefactory/_engine/ai_clients/factory.py +104 -0
  18. clichefactory-0.1.0/clichefactory/_engine/ai_clients/gemini_client.py +424 -0
  19. clichefactory-0.1.0/clichefactory/_engine/ai_clients/json_utils.py +150 -0
  20. clichefactory-0.1.0/clichefactory/_engine/ai_clients/ollama_client.py +170 -0
  21. clichefactory-0.1.0/clichefactory/_engine/ai_clients/openai_client.py +257 -0
  22. clichefactory-0.1.0/clichefactory/_engine/ai_clients/prompts.py +21 -0
  23. clichefactory-0.1.0/clichefactory/_engine/ai_clients/protocol.py +71 -0
  24. clichefactory-0.1.0/clichefactory/_engine/ai_clients/tests/__init__.py +0 -0
  25. clichefactory-0.1.0/clichefactory/_engine/ai_clients/tests/test_gemini_usage_phases.py +32 -0
  26. clichefactory-0.1.0/clichefactory/_engine/ai_clients/usage_tracker.py +23 -0
  27. clichefactory-0.1.0/clichefactory/_engine/cache/__init__.py +10 -0
  28. clichefactory-0.1.0/clichefactory/_engine/cache/base_cacher.py +29 -0
  29. clichefactory-0.1.0/clichefactory/_engine/cache/file_system_cacher.py +49 -0
  30. clichefactory-0.1.0/clichefactory/_engine/config/__init__.py +5 -0
  31. clichefactory-0.1.0/clichefactory/_engine/config/base_config.py +58 -0
  32. clichefactory-0.1.0/clichefactory/_engine/contracts/document_metadata.py +24 -0
  33. clichefactory-0.1.0/clichefactory/_engine/contracts/fingerprinting.py +22 -0
  34. clichefactory-0.1.0/clichefactory/_engine/contracts/key_builder.py +47 -0
  35. clichefactory-0.1.0/clichefactory/_engine/contracts/model_schema.py +248 -0
  36. clichefactory-0.1.0/clichefactory/_engine/contracts/operations.py +28 -0
  37. clichefactory-0.1.0/clichefactory/_engine/contracts/payloads/deployment.py +24 -0
  38. clichefactory-0.1.0/clichefactory/_engine/contracts/payloads/inference.py +53 -0
  39. clichefactory-0.1.0/clichefactory/_engine/contracts/payloads/training.py +138 -0
  40. clichefactory-0.1.0/clichefactory/_engine/contracts/validators.py +45 -0
  41. clichefactory-0.1.0/clichefactory/_engine/extractors/__init__.py +12 -0
  42. clichefactory-0.1.0/clichefactory/_engine/metrics/__init__.py +5 -0
  43. clichefactory-0.1.0/clichefactory/_engine/metrics/metrics_config.py +39 -0
  44. clichefactory-0.1.0/clichefactory/_engine/metrics/metrics_config_example.yaml +22 -0
  45. clichefactory-0.1.0/clichefactory/_engine/models/document_model.py +52 -0
  46. clichefactory-0.1.0/clichefactory/_engine/models/normalized_doc.py +60 -0
  47. clichefactory-0.1.0/clichefactory/_engine/models/usage_summary.py +11 -0
  48. clichefactory-0.1.0/clichefactory/_engine/parsers/csv_parser.py +204 -0
  49. clichefactory-0.1.0/clichefactory/_engine/parsers/doc_parser.py +28 -0
  50. clichefactory-0.1.0/clichefactory/_engine/parsers/docling_pipeline_options.py +7 -0
  51. clichefactory-0.1.0/clichefactory/_engine/parsers/docx_parser.py +37 -0
  52. clichefactory-0.1.0/clichefactory/_engine/parsers/eml_parser.py +183 -0
  53. clichefactory-0.1.0/clichefactory/_engine/parsers/fallback_media_parser.py +100 -0
  54. clichefactory-0.1.0/clichefactory/_engine/parsers/image_parser.py +76 -0
  55. clichefactory-0.1.0/clichefactory/_engine/parsers/media_parser.py +88 -0
  56. clichefactory-0.1.0/clichefactory/_engine/parsers/media_parser_registry.py +79 -0
  57. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/eml_utils.py +181 -0
  58. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/__init__.py +6 -0
  59. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/image_pipeline.py +158 -0
  60. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/image_pipeline_options.py +43 -0
  61. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/parsers/__init__.py +13 -0
  62. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/parsers/docling.py +47 -0
  63. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/parsers/ocr_llm.py +74 -0
  64. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/parsers/pytesseract.py +47 -0
  65. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/image/parsers/rapidocr.py +47 -0
  66. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/lang_mapping.py +127 -0
  67. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/layout/__init__.py +4 -0
  68. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/media_router.py +30 -0
  69. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/media_type_detector.py +197 -0
  70. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/office_converter.py +125 -0
  71. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/__init__.py +5 -0
  72. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/classifier.py +58 -0
  73. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/docling_helpers.py +448 -0
  74. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/docling_pipeline_options.py +105 -0
  75. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/strategies/__init__.py +13 -0
  76. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_baseline.py +79 -0
  77. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/strategies/docling_vlm.py +160 -0
  78. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/strategies/ocr_llm.py +97 -0
  79. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf/strategies/pymupdf_structured.py +118 -0
  80. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/pdf_repair.py +102 -0
  81. clichefactory-0.1.0/clichefactory/_engine/parsers/parser_utils/prompts.py +97 -0
  82. clichefactory-0.1.0/clichefactory/_engine/parsers/pdf_parser.py +130 -0
  83. clichefactory-0.1.0/clichefactory/_engine/parsers/text_parser.py +51 -0
  84. clichefactory-0.1.0/clichefactory/_engine/parsers/xlsx_parser.py +310 -0
  85. clichefactory-0.1.0/clichefactory/_extract_finalize.py +41 -0
  86. clichefactory-0.1.0/clichefactory/_extract_validation.py +49 -0
  87. clichefactory-0.1.0/clichefactory/_local.py +488 -0
  88. clichefactory-0.1.0/clichefactory/_schema.py +248 -0
  89. clichefactory-0.1.0/clichefactory/_service.py +312 -0
  90. clichefactory-0.1.0/clichefactory/_service_url.py +23 -0
  91. clichefactory-0.1.0/clichefactory/_upload.py +244 -0
  92. clichefactory-0.1.0/clichefactory/_utils.py +146 -0
  93. clichefactory-0.1.0/clichefactory/cli.py +652 -0
  94. clichefactory-0.1.0/clichefactory/cliche.py +305 -0
  95. clichefactory-0.1.0/clichefactory/client.py +364 -0
  96. clichefactory-0.1.0/clichefactory/errors.py +61 -0
  97. clichefactory-0.1.0/clichefactory/types.py +131 -0
  98. clichefactory-0.1.0/pyproject.toml +57 -0
  99. clichefactory-0.1.0/tests/__init__.py +0 -0
  100. clichefactory-0.1.0/tests/test_allow_partial.py +123 -0
  101. clichefactory-0.1.0/tests/test_boundary_enforcement.py +154 -0
  102. clichefactory-0.1.0/tests/test_cli.py +265 -0
  103. clichefactory-0.1.0/tests/test_clichefactory_sdk.py +59 -0
  104. clichefactory-0.1.0/tests/test_csv_parser.py +55 -0
  105. clichefactory-0.1.0/tests/test_docling_table_context_snippet.py +88 -0
  106. clichefactory-0.1.0/tests/test_fallback_and_parallel_parsers.py +84 -0
  107. clichefactory-0.1.0/tests/test_lang_mapping.py +151 -0
  108. clichefactory-0.1.0/tests/test_ollama_client.py +45 -0
@@ -0,0 +1,49 @@
1
+ # ── Python ────────────────────────────────────────────────────────────────────
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.so
7
+
8
+ # ── Build & packaging ─────────────────────────────────────────────────────────
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+ *.egg
13
+ MANIFEST
14
+ .eggs/
15
+
16
+ # ── Virtual environments ───────────────────────────────────────────────────────
17
+ .venv/
18
+ venv/
19
+ env/
20
+ .python-version
21
+
22
+ # ── uv ────────────────────────────────────────────────────────────────────────
23
+ # uv.lock is intentionally NOT committed for libraries (only apps pin lockfiles)
24
+ uv.lock
25
+
26
+ # ── Testing & coverage ────────────────────────────────────────────────────────
27
+ .pytest_cache/
28
+ .cache/
29
+ .coverage
30
+ .coverage.*
31
+ htmlcov/
32
+ coverage.xml
33
+
34
+ # ── Type checking & linting ───────────────────────────────────────────────────
35
+ .mypy_cache/
36
+ .ruff_cache/
37
+ .dmypy.json
38
+
39
+ # ── IDE / editor ──────────────────────────────────────────────────────────────
40
+ .vscode/
41
+ .idea/
42
+ *.swp
43
+ *.swo
44
+ .DS_Store
45
+
46
+ # ── Secrets & local config ────────────────────────────────────────────────────
47
+ .env
48
+ .env.*
49
+ *.env
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Urban Susnik s.p.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,473 @@
1
+ Metadata-Version: 2.4
2
+ Name: clichefactory
3
+ Version: 0.1.0
4
+ Summary: Python SDK and CLI for ClicheFactory — structured data extraction from documents
5
+ Project-URL: Homepage, https://clichefactory.com
6
+ Project-URL: Repository, https://github.com/nouveau-rubynho/clichefactory-sdk
7
+ Author-email: "Urban Susnik s.p." <info@clichefactory.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: cli,document-extraction,llm,ocr,pdf,pydantic
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Requires-Python: >=3.12
15
+ Requires-Dist: anthropic>=0.84.0
16
+ Requires-Dist: anyio<5,>=4.0.0
17
+ Requires-Dist: google-genai>=1.66.0
18
+ Requires-Dist: httpx<0.29,>=0.28.0
19
+ Requires-Dist: jsonschema>=4.22.0
20
+ Requires-Dist: openai<3,>=2.29.0
21
+ Requires-Dist: pikepdf>=10.3.0
22
+ Requires-Dist: pydantic<3,>=2.8.0
23
+ Provides-Extra: local
24
+ Requires-Dist: docling<2.71.0,>=2.67.0; extra == 'local'
25
+ Requires-Dist: openpyxl<3.3.0,>=3.1.0; extra == 'local'
26
+ Requires-Dist: pillow<11.0.0,>=10.0.0; extra == 'local'
27
+ Requires-Dist: pymupdf<1.26.0,>=1.25.0; extra == 'local'
28
+ Requires-Dist: pypdf<4.2.0,>=4.0.0; extra == 'local'
29
+ Requires-Dist: pytesseract<0.3.15,>=0.3.10; extra == 'local'
30
+ Requires-Dist: python-docx<1.4.0,>=1.2.0; extra == 'local'
31
+ Requires-Dist: rapidocr<4.0.0,>=3.3.0; extra == 'local'
32
+ Requires-Dist: tesserocr<2.9.0,>=2.7.0; extra == 'local'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # ClicheFactory (Python SDK)
36
+
37
+ ## Introduction
38
+
39
+ ClicheFactory is a structured data extraction SDK. It parses documents (PDF, images, Office, email, etc.) and extracts structured data into Pydantic models — locally with your own LLM keys, or via the ClicheFactory service. Training is managed through [ClicheFactory](https://clichefactory.com); the SDK consumes trained artifacts via `artifact_id`.
40
+
41
+ ## Installing
42
+
43
+ ```bash
44
+ pip install clichefactory
45
+ ```
46
+
47
+ For local parsing/OCR (Docling/PyMuPDF/Tesseract/etc.):
48
+
49
+ ```bash
50
+ pip install "clichefactory[local]"
51
+ ```
52
+
53
+ ## Quickstart
54
+
55
+ ### Local extraction from text
56
+
57
+ ```python
58
+ from pydantic import BaseModel
59
+ from clichefactory import Endpoint, factory
60
+
61
+ class Invoice(BaseModel):
62
+ invoice_number: str | None = None
63
+ total_amount: float | None = None
64
+
65
+ client = factory(
66
+ mode="local",
67
+ model=Endpoint(provider_model="gemini/gemini-3-flash-preview", api_key="..."),
68
+ )
69
+
70
+ c = client.cliche(Invoice)
71
+ invoice = c.extract(text="Invoice #123 total 99.00 EUR")
72
+ print(invoice)
73
+ ```
74
+
75
+ Local mode does **not** pick a default model: you must pass `model=Endpoint(...)` (or `llm=` for compatibility) **or** set `LLM_MODEL_NAME` and `LLM_API_KEY` in the environment. If parsing options use OCR LLM fallback or VLM refinement, configure an OCR LLM the same way (`ocr_model` / `OCR_MODEL_*` env vars) or disable those fallbacks (see [`ParsingOptions`](#parsingoptions)).
76
+
77
+ ### Local extraction from file
78
+
79
+ Requires `clichefactory[local]`. Parses the document (OCR if needed), converts to markdown, then extracts structured data via the LLM.
80
+
81
+ ```python
82
+ invoice = c.extract(file="/path/to/invoice.pdf")
83
+ ```
84
+
85
+ ### Fast extraction (file bytes direct to LLM)
86
+
87
+ Skips OCR/parsing entirely — sends the raw file to a multimodal LLM.
88
+
89
+ ```python
90
+ invoice = c.extract(file="invoice.pdf", mode="fast")
91
+ ```
92
+
93
+ ### Service mode (SaaS)
94
+
95
+ ```python
96
+ from clichefactory import factory
97
+
98
+ client = factory(api_key="cliche-...") # mode defaults to "service"
99
+
100
+ c = client.cliche(Invoice)
101
+ invoice = c.extract(file="/path/to/invoice.pdf")
102
+ ```
103
+
104
+ **Service URL:** By default the SDK uses `http://127.0.0.1:4000` (local aio-server). For production, set the environment variable **`CLICHEFACTORY_API_URL`** to `https://api.clichefactory.com`, or pass **`base_url=`** to `factory()` explicitly (this overrides the env var).
105
+
106
+ Local paths (and raw bytes) are automatically uploaded by the SDK before the service processes them.
107
+
108
+ ### Trained extraction
109
+
110
+ Training is done through [ClicheFactory](https://clichefactory.com). Once you have a trained artifact, use it via `artifact_id`:
111
+
112
+ ```python
113
+ from clichefactory import factory
114
+
115
+ client = factory(api_key="cliche-...")
116
+ cliche = client.cliche(Invoice, artifact_id="art_8cee...")
117
+ result = cliche.extract(file="document.pdf")
118
+ ```
119
+
120
+ **API keys:** Use a key from **ClicheFactory → Settings → API Keys** (`cliche-...`). Those keys authenticate as your account and are billed against your credits. They are not the same as internal aio-server operator keys used between services.
121
+
122
+ **BYOK vs hosted (service mode):**
123
+
124
+ - **BYOK** — Pass `model=Endpoint(..., api_key=...)` (and optionally `ocr_model=`) so extraction/OCR use your LLM credentials. Billing uses the BYOK rate.
125
+ - **Hosted** — Omit `model` / `ocr_model` so the platform runs the LLMs. Your Pydantic schema must still match the trained pipeline’s output shape (as exported from ClicheFactory).
126
+
127
+ **Explicit `mode` vs artifact default:** You can pass `mode=` (e.g. `mode="trained"`) or omit it. When the artifact defines a pipeline mode (e.g. `robust-trained`), the service can apply that mode automatically if you do not override it.
128
+
129
+ **`robust-trained`:** Requires an artifact trained with the verification pipeline (`VerifiedExtractor`). If you only trained a single-step extractor, use default extraction or `mode="trained"` instead of forcing `robust-trained`.
130
+
131
+ ## Extraction modes
132
+
133
+ | Mode | Local | Service | Description |
134
+ |------|-------|---------|-------------|
135
+ | `None` (default) | yes | yes | Parse document -> markdown -> LLM extraction |
136
+ | `"fast"` | yes | yes | Send raw file bytes directly to LLM (no OCR) |
137
+ | `"trained"` | - | yes | Uses a trained artifact (DSPy `BaseExtractor` on OCR text) |
138
+ | `"robust"` | - | yes | Untrained extract + verify (two-stage) |
139
+ | `"robust-trained"` | - | yes | Trained extract + verify; **artifact must be trained for verification** |
140
+
141
+ ```python
142
+ invoice = c.extract(file="/path/to/invoice.pdf", mode="robust")
143
+ ```
144
+
145
+ ## Document to markdown
146
+
147
+ Convert any supported file to a structured markdown representation.
148
+
149
+ ```python
150
+ doc = client.to_markdown(file="invoice.pdf")
151
+ print(doc.get_markdown())
152
+ print(doc.get_pages())
153
+ ```
154
+
155
+ Service mode (set `mode="service"` on the call; it is separate from `factory(mode=...)`):
156
+
157
+ ```python
158
+ doc = client.to_markdown(file="invoice.pdf", mode="service")
159
+
160
+ # Fast mode (VLM-only, no parser pipeline)
161
+ doc = client.to_markdown(file="invoice.pdf", mode="service", parser="fast")
162
+ ```
163
+
164
+ The returned document object provides:
165
+ - `get_markdown()` — full markdown text
166
+ - `get_plain_text()` — plain text without formatting
167
+ - `get_pages()` — list of page objects
168
+ - `get_sections()` — list of section objects
169
+ - `get_tables()` — list of table objects
170
+ - `get_images()` — list of image objects
171
+
172
+ Not every parsing pipeline produces all of these; which accessors return data depends on the parser used.
173
+
174
+ ## Batch operations
175
+
176
+ Process multiple files concurrently with configurable parallelism.
177
+
178
+ ### Batch extraction
179
+
180
+ ```python
181
+ results = c.extract_batch(
182
+ files=["./data/doc1.pdf", "./data/doc2.pdf", "./data/doc3.pdf"],
183
+ max_concurrency=5,
184
+ mode="fast",
185
+ )
186
+ for invoice in results:
187
+ print(invoice.invoice_number, invoice.total_amount)
188
+ ```
189
+
190
+ ### Batch markdown
191
+
192
+ ```python
193
+ docs = client.to_markdown_batch(
194
+ files=["a.pdf", "b.pdf", "c.pdf"],
195
+ max_concurrency=5,
196
+ )
197
+ for doc in docs:
198
+ print(len(doc.get_markdown()), "chars")
199
+ ```
200
+
201
+ Service mode (presign + OCR on the server for each file):
202
+
203
+ ```python
204
+ docs = client.to_markdown_batch(
205
+ files=["a.pdf", "b.pdf"],
206
+ mode="service",
207
+ max_concurrency=5,
208
+ )
209
+ ```
210
+
211
+ ## SaaS pricing (service mode)
212
+
213
+ Billing applies only when using **`mode="service"`** with a ClicheFactory API key. Local runs are not metered by the platform.
214
+
215
+ **Free tier**
216
+
217
+ - **10** lifetime extraction pages (metered per processed page). Those pages are free regardless of full-service vs BYOK.
218
+
219
+ **Paid usage** (credit balance)
220
+
221
+ - After free extraction pages are exhausted, extraction is billed per page from your balance.
222
+ - **Full-service** means the platform runs the LLMs. **BYOK** (bring your own key) applies when you supply your own LLM API key on the client (for example via `Endpoint(..., api_key=...)` or envelope config as implemented in the SDK).
223
+
224
+ Default rates (USD; the API may override these per deployment via stored rate rows):
225
+
226
+ | Operation | Full-service | BYOK |
227
+ |-----------|--------------|------|
228
+ | Extraction (per page) | $0.005 | $0.0005 |
229
+ | Training | via [ClicheFactory](https://clichefactory.com) | via [ClicheFactory](https://clichefactory.com) |
230
+
231
+ ## Configuration
232
+
233
+ ### `Endpoint` (BYOK LLM config)
234
+
235
+ ```python
236
+ from clichefactory import Endpoint
237
+
238
+ model = Endpoint(
239
+ provider_model="gemini/gemini-3-flash-preview",
240
+ api_key="...",
241
+ max_tokens=100000,
242
+ temperature=1.0,
243
+ num_retries=8,
244
+ api_base=None, # for Ollama: "http://localhost:11434"
245
+ )
246
+
247
+ client = factory(mode="local", model=model)
248
+ ```
249
+
250
+ ### Advanced multi-model overrides
251
+
252
+ Most users should set only `model`. If you need role-specific endpoints, override per role:
253
+
254
+ ```python
255
+ client = factory(
256
+ mode="service",
257
+ api_key="cliche-...",
258
+ model=Endpoint(provider_model="gemini/gemini-3-flash-preview", api_key="..."), # extraction default
259
+ ocr_model=Endpoint(provider_model="gemini/gemini-3-flash-preview", api_key="..."), # optional
260
+ )
261
+ ```
262
+
263
+ Per-call overrides are also available:
264
+
265
+ ```python
266
+ invoice = c.extract(file="/path/to/invoice.pdf", model=Endpoint(...), ocr_model=Endpoint(...))
267
+ ```
268
+
269
+ ### `ParsingOptions`
270
+
271
+ Fine-grained control over local-mode document parsing. `ParsingOptions` only applies to local extraction — in service mode the platform selects the optimal parsing strategy and this parameter is ignored.
272
+
273
+ ```python
274
+ from clichefactory import ParsingOptions
275
+
276
+ parsing = ParsingOptions(
277
+ pdf_image_parser="docling", # "docling", "docling_vlm", "vision_layout" (SaaS-only)
278
+ pdf_fallback_to_ocr_llm=True, # fall back to LLM OCR when local parser fails
279
+ pdf_structured_fallback_to_image=False, # retry structured PDFs as image-scanned on failure
280
+ pdf_ocr_engine="rapidocr", # "rapidocr", "tesseract", "easyocr"
281
+ pdf_ocr_lang="eng", # language code(s), see OCR language section below
282
+ use_ocr_llm_body=True, # use LLM for body text when parser supports it
283
+
284
+ image_parser="rapidocr", # "rapidocr", "pytesseract", "docling", "ocr_llm"
285
+ image_parser_fallback=True, # fall back to ocr_llm on failure
286
+ image_parser_lang="eng", # language code(s), see OCR language section below
287
+ )
288
+
289
+ client = factory(mode="local", model=model, parsing=parsing)
290
+ ```
291
+
292
+ ### Environment variables
293
+
294
+ For **local** runs, the primary extraction defaults are:
295
+
296
+ | Role | Variables | Notes |
297
+ |------|-----------|--------|
298
+ | Extraction LLM | `LLM_MODEL_NAME`, `LLM_API_KEY` | Also accepted: `MODEL_NAME` / `MODEL_API_KEY`, `EXTRACTION_LLM_MODEL_NAME` / `EXTRACTION_LLM_API_KEY`. **No implicit default model** — if unset, local extraction fails until you configure a model. |
299
+ | OCR LLM (optional) | `OCR_MODEL_NAME`, `OCR_MODEL_API_KEY` | Used when you set a separate OCR endpoint; otherwise OCR reuses the extraction model when your parsing options need an OCR LLM. Aliases include `OCR_LLM_MODEL_NAME` / `OCR_LLM_API_KEY` and `OCR_API_KEY`. |
300
+
301
+ Optional endpoints override extraction/OCR on `factory()` via `model` and `ocr_model`.
302
+
303
+ For **service** mode, the only URL-related environment variable is **`CLICHEFACTORY_API_URL`** (unless you pass `base_url=` to `factory()`, which wins).
304
+
305
+ ### Ollama (local model inference)
306
+
307
+ ```bash
308
+ curl -fsSL https://ollama.com/install.sh | sh
309
+ ollama run llama3.2:1b
310
+ ```
311
+
312
+ ```python
313
+ client = factory(
314
+ mode="local",
315
+ model=Endpoint(provider_model="ollama/llama3.2:1b", api_key="", api_base="http://localhost:11434"),
316
+ )
317
+ ```
318
+
319
+ Current scope: Ollama supports text extraction only (`extract(text=...)`). File parsing and OCR paths are not supported for Ollama.
320
+
321
+ ## PDF parser selection
322
+
323
+ | Parser | Config value | Description |
324
+ |--------|-------------|-------------|
325
+ | Docling | `"docling"` | Local OCR + table structure via Docling (default). Full structured output. |
326
+ | VLM direct | `"fast"` extraction mode | Sends the whole PDF to the LLM. No layout structure, fastest. |
327
+ | Docling + VLM | `"docling_vlm"` | Docling for structure + per-page VLM refinement. |
328
+ | Vision Layout | `"vision_layout"` | More performant layout detection. **SaaS-only**. |
329
+
330
+ Set via `ParsingOptions(pdf_image_parser=...)` or on `factory(parsing=...)`.
331
+
332
+ ### OCR LLM fallback for Docling-based parsers
333
+
334
+ Docling-based parsers can fall back to **OCR LLM** (your configured vision-capable model) when Docling produces empty or degenerate output. Controlled by `pdf_fallback_to_ocr_llm` (default `True`). That path requires a configured OCR LLM (or the same model as extraction).
335
+
336
+ ### Parallel OCR LLM refinement calls
337
+
338
+ VLM-oriented parsers (e.g. `docling_vlm`) can issue multiple parallel **OCR LLM** calls per document (per-page or per-table) to keep latency under control.
339
+
340
+ ## ClicheFactory UI integration
341
+
342
+ Documents extracted via the SDK appear in [ClicheFactory](https://clichefactory.com) **only when
343
+ you set both `project` and `task`** on the factory. Documents without explicit
344
+ scope are extraction-only and won't appear in the ClicheFactory UI.
345
+
346
+ ```python
347
+ from clichefactory import factory
348
+
349
+ client = factory(
350
+ api_key="cliche-...",
351
+ project="42", # ClicheFactory Project ID (visible in URL: /projects/42/)
352
+ task="108", # ClicheFactory Batch ID (visible in URL: /batch/108/)
353
+ )
354
+
355
+ # Extractions will appear under that project/batch in ClicheFactory
356
+ result = client.cliche(MySchema).extract(file="document.pdf")
357
+ ```
358
+
359
+ Documents sync automatically every ~30 minutes, or immediately via the
360
+ "Sync from SDK" button in the ClicheFactory UI.
361
+
362
+ If you omit `project`/`task`, extraction works normally — your data just
363
+ won't be visible in ClicheFactory.
364
+
365
+ **Tenant id (HTTP APIs):** User API keys resolve to a **tenant id** stored with the key (typically your ClicheFactory user id as a string, e.g. `"1"`). Envelope `tenant_id="default"` is rewritten server-side to that tenant for inference. When calling aio-server REST endpoints directly (e.g. listing documents), pass **`tenant_id` matching your key’s tenant**, not the literal string `"default"`, or the request will be rejected.
366
+
367
+ ## OCR language configuration
368
+
369
+ Languages are specified using **Tesseract format** everywhere — the SDK converts internally for each engine. Use `+` to combine multiple languages (e.g. `"slv+eng"` for Slovenian + English).
370
+
371
+ ```python
372
+ parsing = ParsingOptions(
373
+ pdf_ocr_lang="deu+eng", # German + English for PDFs
374
+ image_parser_lang="fra", # French for images
375
+ )
376
+ ```
377
+
378
+ The default language is `"eng"` (English).
379
+
380
+ ### How languages work per OCR engine
381
+
382
+ | Engine | Config value | Language handling | System dependency |
383
+ |--------|-------------|-------------------|-------------------|
384
+ | **Tesseract** | `pdf_ocr_engine="tesseract"` / `image_parser="pytesseract"` | Uses Tesseract format directly (`"slv+eng"`). Requires matching `.traineddata` files under `$TESSDATA_PREFIX`. | Tesseract binary on `PATH` |
385
+ | **RapidOCR** | `pdf_ocr_engine="rapidocr"` / `image_parser="rapidocr"` | Maps language to a script family (e.g. `"eng"` → English model, `"deu"` → Latin model). No per-language model download needed. | None (pure Python, ONNX) |
386
+ | **EasyOCR** | `pdf_ocr_engine="easyocr"` / `image_parser="easyocr"` | Converts to ISO 639-1 codes (e.g. `"eng"` → `"en"`, `"deu"` → `"de"`). Downloads per-language models on first use. | None (pure Python, PyTorch) |
387
+ | **Docling** | `image_parser="docling"` | Uses Docling's built-in image conversion. No language parameter. | None |
388
+ | **OCR LLM** | `image_parser="ocr_llm"` | VLM-based — the model handles language detection automatically. | Configured `ocr_model` |
389
+
390
+ ### Common language codes
391
+
392
+ | Language | Code | Notes |
393
+ |----------|------|-------|
394
+ | English | `eng` | Default |
395
+ | German | `deu` | |
396
+ | French | `fra` | |
397
+ | Spanish | `spa` | |
398
+ | Italian | `ita` | |
399
+ | Slovenian | `slv` | |
400
+ | Polish | `pol` | |
401
+ | Russian | `rus` | Cyrillic script |
402
+ | Chinese (Simplified) | `chi_sim` | |
403
+ | Japanese | `jpn` | |
404
+ | Korean | `kor` | |
405
+ | Arabic | `ara` | RTL script |
406
+
407
+ Multi-language example: `"slv+eng"` (Slovenian + English), `"deu+fra"` (German + French).
408
+
409
+ ### RapidOCR script families
410
+
411
+ RapidOCR operates at the script level, not individual languages. Multiple Latin-script languages (German, French, Slovenian, etc.) all map to the `latin` model. The SDK handles this mapping automatically — you still specify languages in Tesseract format.
412
+
413
+ | Script family | Covers |
414
+ |--------------|--------|
415
+ | `en` | English (dedicated model) |
416
+ | `latin` | German, French, Spanish, Italian, Slovenian, Polish, etc. |
417
+ | `cyrillic` | Russian, Ukrainian, Bulgarian, Serbian |
418
+ | `ch` | Chinese (Simplified) |
419
+ | `japan` | Japanese |
420
+ | `korean` | Korean |
421
+ | `arabic` | Arabic |
422
+ | `devanagari` | Hindi, Bengali |
423
+
424
+ ## Local parsing dependencies
425
+
426
+ ### DOC/ODT conversion
427
+
428
+ For legacy Office files (`.doc`, `.odt`), the parser converts files to PDF first, then processes them through the PDF pipeline.
429
+
430
+ When running locally (rather than in service mode), this requires the following external system tools:
431
+
432
+ - `pandoc` for general Office -> PDF conversion
433
+ - `LibreOffice` (`soffice`) for legacy `.doc` conversion
434
+
435
+ If these tools are missing, `.doc`/`.odt` parsing will fail at runtime.
436
+
437
+ ### Tesseract OCR
438
+
439
+ If you use a Tesseract-based OCR engine (`pdf_ocr_engine="tesseract"` or `image_parser="pytesseract"`), ensure:
440
+
441
+ - Tesseract is installed and on `PATH`
442
+ - The language data directory is configured via `TESSDATA_PREFIX`
443
+
444
+ ```bash
445
+ # macOS with Homebrew
446
+ export TESSDATA_PREFIX="/opt/homebrew/opt/tesseract/share/tessdata"
447
+ ```
448
+
449
+ Languages configured in `pdf_ocr_lang` must have matching `.traineddata` files under `$TESSDATA_PREFIX`.
450
+
451
+ ### RapidOCR font
452
+
453
+ Docling uses RapidOCR, which may try to download a font (FZYTK.TTF) at runtime. Set a local font path to avoid this:
454
+
455
+ ```bash
456
+ export DOCLING_OCR_FONT_PATH="/path/to/a/unicode.ttf"
457
+ ```
458
+
459
+ On macOS, a system font is usually available automatically when this variable is unset.
460
+ On Linux/Windows or restricted environments, setting `DOCLING_OCR_FONT_PATH` is recommended.
461
+
462
+ ## Supported file types
463
+
464
+ | Extension(s) | Parser | Notes |
465
+ |-------------|--------|-------|
466
+ | `.pdf` | PdfRouterParser | Classifies structured vs scanned, routes accordingly |
467
+ | `.png`, `.jpg`, `.jpeg`, `.webp`, `.gif`, `.bmp` | ImageRouterParser | Routes to configured image parser |
468
+ | `.docx` | DocxParser | Via Docling |
469
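+ | `.doc`, `.odt` | DocParser | Converted to PDF first, then processed through the PDF pipeline (requires `pandoc` / LibreOffice locally) |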
470
+ | `.xlsx` | XlsxParser | |
471
+ | `.csv` | CsvParser | Auto-detect delimiter and header |
472
+ | `.eml` | EmlParser | RFC 2822 with recursive attachment parsing |
473
+ | `.txt`, `.md` | TextParser | Passthrough with encoding detection |