entityxtract 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. entityxtract-0.5.2/.env.sample +3 -0
  2. entityxtract-0.5.2/.github/workflows/publish.yml +41 -0
  3. entityxtract-0.5.2/.gitignore +20 -0
  4. entityxtract-0.5.2/.python-version +1 -0
  5. entityxtract-0.5.2/LICENSE +21 -0
  6. entityxtract-0.5.2/PKG-INFO +320 -0
  7. entityxtract-0.5.2/README.md +289 -0
  8. entityxtract-0.5.2/docs/assets/entityxtract_flow.png +0 -0
  9. entityxtract-0.5.2/docs/assets/logo.png +0 -0
  10. entityxtract-0.5.2/memory-bank/activeContext.md +107 -0
  11. entityxtract-0.5.2/memory-bank/productContext.md +160 -0
  12. entityxtract-0.5.2/memory-bank/progress.md +144 -0
  13. entityxtract-0.5.2/memory-bank/projectbrief.md +182 -0
  14. entityxtract-0.5.2/memory-bank/systemPatterns.md +206 -0
  15. entityxtract-0.5.2/memory-bank/techContext.md +86 -0
  16. entityxtract-0.5.2/pyproject.toml +44 -0
  17. entityxtract-0.5.2/src/entityxtract/__init__.py +0 -0
  18. entityxtract-0.5.2/src/entityxtract/config.py +46 -0
  19. entityxtract-0.5.2/src/entityxtract/extractor.py +433 -0
  20. entityxtract-0.5.2/src/entityxtract/extractor_types.py +254 -0
  21. entityxtract-0.5.2/src/entityxtract/logging_config.py +118 -0
  22. entityxtract-0.5.2/src/entityxtract/pdf/__init__.py +0 -0
  23. entityxtract-0.5.2/src/entityxtract/pdf/converter.py +95 -0
  24. entityxtract-0.5.2/src/entityxtract/pdf/extractor.py +141 -0
  25. entityxtract-0.5.2/src/entityxtract/prompts/__init__.py +33 -0
  26. entityxtract-0.5.2/src/entityxtract/prompts/string.txt +36 -0
  27. entityxtract-0.5.2/src/entityxtract/prompts/system.txt +2 -0
  28. entityxtract-0.5.2/src/entityxtract/prompts/table.txt +38 -0
  29. entityxtract-0.5.2/tests/__init__.py +0 -0
  30. entityxtract-0.5.2/tests/data/attention-is-all-you-need.pdf +0 -0
  31. entityxtract-0.5.2/tests/test.py +139 -0
  32. entityxtract-0.5.2/tests/utils_io.py +25 -0
  33. entityxtract-0.5.2/uv.lock +1362 -0
@@ -0,0 +1,3 @@
1
+ OPENAI_API_KEY="your-api-key"
2
+ OPENAI_API_BASE="https://openrouter.ai/api/v1"
3
+ OPENAI_DEFAULT_MODEL="google/gemini-2.5-flash"
@@ -0,0 +1,41 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build distribution
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+
17
+ - name: Build package
18
+ run: uv build
19
+
20
+ - name: Upload build artifacts
21
+ uses: actions/upload-artifact@v4
22
+ with:
23
+ name: dist
24
+ path: dist/
25
+
26
+ publish:
27
+ name: Publish to PyPI
28
+ needs: build
29
+ runs-on: ubuntu-latest
30
+ environment: pypi
31
+ permissions:
32
+ id-token: write # Required for trusted publishing via OIDC
33
+ steps:
34
+ - name: Download build artifacts
35
+ uses: actions/download-artifact@v4
36
+ with:
37
+ name: dist
38
+ path: dist/
39
+
40
+ - name: Publish to PyPI
41
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,20 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+ .vscode
12
+
13
+ config.yml
14
+ .env
15
+
16
+ # Logs
17
+ logs/
18
+
19
+ # Misc
20
+ .DS_Store
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Prathamesh Ghatole
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,320 @@
1
+ Metadata-Version: 2.4
2
+ Name: entityxtract
3
+ Version: 0.5.2
4
+ Summary: A provider-agnostic, entity-centric LLM-powered document entity extraction tool
5
+ Project-URL: Homepage, https://github.com/Prathamesh-Ghatole/entityxtract
6
+ Project-URL: Repository, https://github.com/Prathamesh-Ghatole/entityxtract
7
+ Project-URL: Issues, https://github.com/Prathamesh-Ghatole/entityxtract/issues
8
+ Author-email: Prathamesh-Ghatole <prathamesh.s.ghatole@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: ai,document,entity,extraction,llm,nlp,pdf,structured-data
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: fastapi[standard]>=0.116.1
21
+ Requires-Dist: langchain-openai>=0.3.32
22
+ Requires-Dist: langchain>=0.3.27
23
+ Requires-Dist: pillow>=11.3.0
24
+ Requires-Dist: polars>=1.33.0
25
+ Requires-Dist: pydantic>=2.11.7
26
+ Requires-Dist: pypdfium2>=4.30.0
27
+ Requires-Dist: python-dotenv>=1.1.1
28
+ Requires-Dist: requests>=2.32.5
29
+ Requires-Dist: xlsxwriter>=3.2.5
30
+ Description-Content-Type: text/markdown
31
+
32
+ <!-- <p align="center">
33
+ <a href="https://github.com/Prathamesh-Ghatole/entityxtract">
34
+ <img loading="lazy" alt="entityxtract" src="https://github.com/Prathamesh-Ghatole/entityxtract/raw/main/docs/assets/logo.png" width="50%"/>
35
+ </a>
36
+ </p> -->
37
+
38
+ # entityxtract
39
+
40
+ [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv)
41
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
42
+ [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev)
43
+ [![Python 3.12+](https://img.shields.io/badge/python-3.12+-blue.svg)](https://www.python.org/downloads/)
44
+ [![License MIT](https://img.shields.io/github/license/Prathamesh-Ghatole/entityxtract)](https://opensource.org/licenses/MIT)
45
+
46
+ **Entity-first, schema-driven extraction of structured data from unstructured documents** (PDF, DOCX, TXT, images). Define custom entities with schemas, few-shot examples, and instructions, then extract reliably using any local or SOTA LLM.
47
+
48
+ Built as an **open-source alternative** to Google Cloud Document AI, Azure AI Document Intelligence, and Adobe PDF Extract — but provider-agnostic and designed to work with any LLM.
49
+
50
+ <p align="center">
51
+ <a href="https://github.com/Prathamesh-Ghatole/entityxtract">
52
+ <img loading="lazy" alt="entityxtract" src="https://github.com/Prathamesh-Ghatole/entityxtract/raw/main/docs/assets/entityxtract_flow.png" width="100%"/>
53
+ </a>
54
+ </p>
55
+
56
+
57
+ ## Features
58
+
59
+ * 🎯 **Entity-first extraction** — Smart structured data extraction with pre-defined entities (automatic entity identification is coming soon).
60
+ * 📄 **Multiple document formats** — Support for PDF, TXT, MD, and images.
61
+ * 🔀 **Smart input modes** — Extract information using text, OCR, or hybrid approaches.
62
+ * 🌐 **Provider-agnostic design** — Works with any LLM via OpenAI-compatible APIs.
63
+ * 🔄 **Robust execution** — Built-in retries, parallel extraction, strictly structured and typed output.
64
+ * 📊 **Observability** — Structured logs, token usage tracking, and optional cost tracking.
65
+ * 📦 **PyPI Package** — Easily install and use entityxtract in your projects.
66
+
67
+ ### Coming Soon
68
+
69
+ * 🌐 **FastAPI REST API** for remote extraction services.
70
+ * 🖥️ **Web UI** for visual entity/schema management and job monitoring.
71
+ * 🔍 **Auto-detect mode** to automatically identify extractable entities in documents.
72
+ * 💰 **Cost Optimization** using PDF annotation caching and smart input data pruning.
73
+ * 👁️ **DeepSeek OCR** integration for enhanced document processing.
74
+ * 🔌 **MCP server** for agentic applications.
75
+
76
+ ## Installation
77
+
78
+ To use entityxtract, you'll need Python 3.12+ and [uv](https://docs.astral.sh/uv/) (recommended):
79
+
80
+ ```bash
81
+ # Install uv if you haven't already
82
+ curl -LsSf https://astral.sh/uv/install.sh | sh
83
+
84
+ # Clone the repository
85
+ git clone https://github.com/Prathamesh-Ghatole/entityxtract.git
86
+ cd entityxtract
87
+
88
+ # Install dependencies
89
+ uv sync
90
+ ```
91
+
92
+ ## Getting Started
93
+
94
+ Extract pre-defined entities:
95
+
96
+ ```python
97
+ from pathlib import Path
98
+ import polars as pl
99
+ from entityxtract.extractor_types import (
100
+ Document, TableToExtract, ObjectsToExtract,
101
+ ExtractionConfig, FileInputMode
102
+ )
103
+ from entityxtract.extractor import extract_objects
104
+
105
+ # 1. Load your document
106
+ doc = Document(Path("document.pdf"))
107
+
108
+ # 2. Define what to extract
109
+ table = TableToExtract(
110
+ name="Events",
111
+ example_table=pl.DataFrame([
112
+ {"Time": "02:05", "Type": "Operation", "Description": "Example event"},
113
+ {"Time": "03:25", "Type": "Transit", "Description": "Another event"}
114
+ ]),
115
+ instructions="Extract the events table with Time, Type, and Description columns.",
116
+ required=True
117
+ )
118
+
119
+ # 3. Configure extraction
120
+ config = ExtractionConfig(
121
+ model_name="google/gemini-2.5-flash", # Recommended
122
+ temperature=0.0,
123
+ file_input_modes=[FileInputMode.FILE]
124
+ )
125
+
126
+ # 4. Extract!
127
+ results = extract_objects(doc, ObjectsToExtract(objects=[table], config=config))
128
+
129
+ # Use your results
130
+ for name, result in results.results.items():
131
+ if result.success:
132
+ df = pl.DataFrame(result.extracted_data)
133
+ print(df)
134
+ else:
135
+ print(f"Failed: {result.message}")
136
+ ```
137
+
138
+ ## Configuration
139
+
140
+ Copy the sample environment file `.env.sample` to `.env`, or set the following environment variables directly:
141
+
142
+ ```bash
143
+ # For all OpenAI-compatible endpoints (OpenAI, OpenRouter, Ollama, LM Studio, etc.)
144
+ export OPENAI_API_KEY="your-api-key"
145
+ export OPENAI_API_BASE="https://openrouter.ai/api/v1"
146
+
147
+ # Default model
148
+ export OPENAI_DEFAULT_MODEL="google/gemini-2.5-flash"
149
+ ```
150
+
151
+ ## Usage Examples
152
+
153
+ ### Complete Example with Multiple Entities
154
+
155
+ ```python
156
+ from pathlib import Path
157
+ import polars as pl
158
+
159
+ from entityxtract.extractor_types import (
160
+ Document, ExtractionConfig, FileInputMode,
161
+ TableToExtract, StringToExtract, ObjectsToExtract
162
+ )
163
+ from entityxtract.extractor import extract_objects
164
+
165
+ # Load document
166
+ doc = Document(Path("reports/quarterly_summary.pdf"))
167
+
168
+ # Define entities to extract
169
+ table = TableToExtract(
170
+ name="Financial Summary",
171
+ example_table=pl.DataFrame([
172
+ {"Quarter": "Q1 2024", "Revenue": "$1.2M", "Expenses": "$800K", "Profit": "$400K"},
173
+ {"Quarter": "Q2 2024", "Revenue": "$1.5M", "Expenses": "$900K", "Profit": "$600K"}
174
+ ]),
175
+ instructions="Extract the quarterly financial summary table with Quarter, Revenue, Expenses, and Profit columns.",
176
+ required=True
177
+ )
178
+
179
+ report_id = StringToExtract(
180
+ name="Report ID",
181
+ example_string="RPT-2024-Q2-001",
182
+ instructions="Extract the report identifier from the document header.",
183
+ required=False
184
+ )
185
+
186
+ # Configure extraction with cost tracking
187
+ config = ExtractionConfig(
188
+ model_name="google/gemini-2.5-flash",
189
+ temperature=0.0,
190
+ file_input_modes=[FileInputMode.FILE],
191
+ parallel_requests=4,
192
+ calculate_costs=True
193
+ )
194
+
195
+ # Run extraction
196
+ objects = ObjectsToExtract(objects=[table, report_id], config=config)
197
+ results = extract_objects(doc, objects)
198
+
199
+ # Process results
200
+ for name, res in results.results.items():
201
+ if res.success:
202
+ print(f"✓ [{name}] extracted successfully")
203
+ print(f" Tokens: {res.input_tokens} in / {res.output_tokens} out")
204
+ print(f" Cost: ${res.cost:.4f}")
205
+
206
+ # Export table to CSV
207
+ if isinstance(res.extracted_data, list):
208
+ df = pl.DataFrame(res.extracted_data)
209
+ df.write_csv(f"{name}.csv")
210
+ print(f" Saved to {name}.csv")
211
+ else:
212
+ print(f"✗ [{name}] failed: {res.message}")
213
+
214
+ print(f"\nTotals: {results.total_input_tokens} tokens in, {results.total_output_tokens} tokens out")
215
+ print(f"Total cost: ${results.total_cost:.4f}")
216
+ ```
217
+
218
+ ### Different Input Modes
219
+
220
+ ```python
221
+ # Pass document as file attachment
222
+ config = ExtractionConfig(
223
+ model_name="google/gemini-2.5-flash",
224
+ file_input_modes=[FileInputMode.FILE]
225
+ )
226
+
227
+ # Pass document as text content
228
+ config = ExtractionConfig(
229
+ model_name="google/gemini-2.5-flash",
230
+ file_input_modes=[FileInputMode.TEXT]
231
+ )
232
+
233
+ # Pass document as images (useful for scanned documents)
234
+ config = ExtractionConfig(
235
+ model_name="google/gemini-2.5-flash",
236
+ file_input_modes=[FileInputMode.IMAGE]
237
+ )
238
+
239
+ # Combine multiple input modes
240
+ config = ExtractionConfig(
241
+ model_name="google/gemini-2.5-flash",
242
+ file_input_modes=[FileInputMode.FILE, FileInputMode.TEXT]
243
+ )
244
+ ```
245
+
246
+ See `tests/test.py` for more complete examples.
247
+
248
+ ## Roadmap
249
+
250
+ ### Interfaces
251
+ - 🌐 FastAPI REST API for remote extraction services
252
+ - 🖥️ Web UI for entity management, job runs, and results review
253
+ - 🤖 Auto-detect mode: automatically identify entities in documents
254
+
255
+ ### Developer Experience
256
+ - 📦 Publish to PyPI for easy `pip install entityxtract`
257
+ - ⚡ ENV-first configuration (deprecate YAML)
258
+ - 💾 Document annotation caching to reduce token usage
259
+ - 🔧 JSON import/export for entity schemas and results
260
+ - 📝 Enhanced CLI with `entityxtract` command
261
+
262
+ ### Providers & Models
263
+ - 🏠 Local inference via Ollama
264
+ - 🔌 Native adapters for OpenAI, Gemini, Claude, and more
265
+ - 🌍 Support for additional LLM providers
266
+
267
+ ### Quality & Testing
268
+ - ✅ Expanded test coverage
269
+ - 📊 Benchmark suite for accuracy and performance
270
+ - 📚 Comprehensive documentation site
271
+
272
+ ## Comparisons
273
+
274
+ entityxtract positions itself as a flexible, open-source alternative to both commercial services and closed-source solutions:
275
+
276
+ **Key Differentiators:**
277
+ - **Provider Agnostic**: Works with any LLM, not locked to a single provider
278
+ - **Open Source**: Full transparency, customizable, and community-driven
279
+ - **Schema + Examples**: Strong emphasis on structured entity definitions with few-shot learning
280
+ - **Complete Stack**: Python SDK today, REST API and Web UI coming soon
281
+
282
+ ## Contributing
283
+
284
+ We welcome contributions! entityxtract uses modern Python tooling:
285
+
286
+ ```bash
287
+ # Use uv for environment management
288
+ uv sync
289
+
290
+ # Run tests
291
+ uv run pytest tests/
292
+
293
+ # Lint and format with Ruff
294
+ uv run ruff check .
295
+ uv run ruff format .
296
+ ```
297
+
298
+ **Guidelines:**
299
+ - Follow strict JSON output conventions
300
+ - Include tests for new features
301
+ - Update documentation as needed
302
+ - Use structured logging patterns
303
+
304
+ Open an issue or PR with a clear description and we'll be happy to review!
305
+
306
+ ## Get Help and Support
307
+
308
+ - 💬 [GitHub Discussions](https://github.com/Prathamesh-Ghatole/entityxtract/discussions) - Ask questions and share ideas
309
+ - 🐛 [Issues](https://github.com/Prathamesh-Ghatole/entityxtract/issues) - Report bugs or request features
310
+ - 📧 Contact: prathamesh.s.ghatole@gmail.com
311
+
312
+ ## License
313
+
314
+ entityxtract is released under the [MIT License](LICENSE). Free for commercial and personal use.
315
+
316
+ ---
317
+
318
+ **Built with ❤️ by [Prathamesh Ghatole](https://github.com/Prathamesh-Ghatole)**
319
+
320
+ *entityxtract was built out of the need for intelligent entity extraction from documents using AI with minimal effort. Define what you need, and let AI handle the rest.*