PyPI - textgleaner - Versions diffs - 1.2.0__tar.gz - Mend

textgleaner 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

textgleaner-1.2.0/LICENSE +21 -0
textgleaner-1.2.0/PKG-INFO +238 -0
textgleaner-1.2.0/README.md +206 -0
textgleaner-1.2.0/pyproject.toml +49 -0
textgleaner-1.2.0/setup.cfg +4 -0
textgleaner-1.2.0/tests/test_config.py +281 -0
textgleaner-1.2.0/tests/test_extractor.py +579 -0
textgleaner-1.2.0/tests/test_reporter.py +304 -0
textgleaner-1.2.0/tests/test_schema_generator.py +104 -0
textgleaner-1.2.0/tests/test_schema_refiner.py +259 -0
textgleaner-1.2.0/textgleaner/__init__.py +545 -0
textgleaner-1.2.0/textgleaner/cli.py +199 -0
textgleaner-1.2.0/textgleaner/config.py +24 -0
textgleaner-1.2.0/textgleaner/extractor.py +292 -0
textgleaner-1.2.0/textgleaner/llm_client.py +171 -0
textgleaner-1.2.0/textgleaner/reporter.py +246 -0
textgleaner-1.2.0/textgleaner/schema_generator.py +253 -0
textgleaner-1.2.0/textgleaner/schema_refiner.py +254 -0
textgleaner-1.2.0/textgleaner.egg-info/PKG-INFO +238 -0
textgleaner-1.2.0/textgleaner.egg-info/SOURCES.txt +22 -0
textgleaner-1.2.0/textgleaner.egg-info/dependency_links.txt +1 -0
textgleaner-1.2.0/textgleaner.egg-info/entry_points.txt +2 -0
textgleaner-1.2.0/textgleaner.egg-info/requires.txt +10 -0
textgleaner-1.2.0/textgleaner.egg-info/top_level.txt +1 -0

textgleaner-1.2.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Lyutenant
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

textgleaner-1.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,238 @@
+Metadata-Version: 2.4
+Name: textgleaner
+Version: 1.2.0
+Summary: Structured data extraction from plain-text documents using local LLM tool calls
+Author: Lyutenant
+License: MIT
+Project-URL: Homepage, https://github.com/Lyutenant/text-gleaner
+Project-URL: Repository, https://github.com/Lyutenant/text-gleaner
+Project-URL: Issues, https://github.com/Lyutenant/text-gleaner/issues
+Keywords: llm,extraction,ollama,nlp,text,structured-data
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: typer>=0.12
+Requires-Dist: httpx>=0.27
+Requires-Dist: pydantic-settings>=2.0
+Requires-Dist: pyyaml>=6.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Provides-Extra: excel
+Requires-Dist: openpyxl>=3.1; extra == "excel"
+Dynamic: license-file
+# textgleaner
+Extract structured data from plain-text documents using a local LLM.
+textgleaner uses a two-phase approach:
+1. **Generate schema** — the LLM analyzes sample documents and your description to produce a JSON extraction schema
+2. **Extract** — the LLM is forced to call the schema as a tool, returning deterministic, schema-validated JSON
+All inference runs via [Ollama](https://ollama.com) on a machine you control — locally by default, in which case no data leaves your machine.
+---
+## Requirements
+- Python 3.10+
+- [Ollama](https://ollama.com) running locally (or on a remote host)
+- A model that supports tool calls (e.g. `qwen3:30b`, `llama3.1:8b`)
+---
+## Installation
+```bash
+pip install textgleaner
+```
+Or from source:
+```bash
+git clone https://github.com/Lyutenant/text-gleaner
+cd text-gleaner
+pip install -e .
+```
+---
+## Configuration
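+Copy the example config and edit it (`config.example.yaml` is not part of the PyPI package; if you installed with `pip`, create `config.yaml` from the template below):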
+```bash
+cp config.example.yaml config.yaml
+```
+```yaml
+llm:
+  base_url: "http://localhost:11434"   # Ollama default
+  model: "qwen3:30b"
+  api_key: "local"
+  temperature: 0.2
+  max_tokens: 32768
+  timeout_seconds: 1800
+extraction:
+  confidence_scores: true
+  max_chars: 200000
+```
+You can also configure via environment variables:
+```bash
+export TEXTGLEANER__LLM__BASE_URL="http://localhost:11434"
+export TEXTGLEANER__LLM__MODEL="qwen3:30b"
+```
+---
+## CLI
+```bash
+# Phase 1: generate a schema from sample documents
+textgleaner generate-schema \
+  --samples sample1.txt sample2.txt \
+  --description description.yaml \
+  --output schema.json
+# Phase 2: extract structured data
+textgleaner extract \
+  --inputs statement.txt \
+  --schema schema.json \
+  --output result.json
+# Use a custom config file
+textgleaner --config myconfig.yaml extract --inputs doc.txt --schema schema.json
+```
+---
+## Python API
+### Quick start
+```python
+from textgleaner import Config, generate_schema, extract, Text
+# Load config from YAML
+cfg = Config.from_yaml("config.yaml")
+# Or set values directly
+cfg = Config(base_url="http://localhost:11434", model="qwen3:30b")
+# Phase 1: generate a schema
+schema = generate_schema(
+    samples=["jan.txt", "feb.txt"],
+    description="Monthly brokerage statement with holdings and transactions.",
+    output="schema.json",
+    config=cfg,
+)
+# Phase 2: extract from a single file
+result = extract("statement.txt", schema=schema, config=cfg)
+# Phase 2: extract from multiple files → {filename: dict}
+results = extract(["jan.txt", "feb.txt"], schema=schema, output="results.json", config=cfg)
+```
+### Sectionized extraction with `Text`
+Use `Text` to pass raw text slices directly — useful when you want to split a document before extracting:
+```python
+from textgleaner import Config, extract, Text
+cfg = Config.from_yaml("config.yaml")
+# Split a document on form-feed page breaks
+pages = open("statement.txt").read().split("\f")
+# Extract from a specific page range
+result = extract(
+    Text("".join(pages[4:8]), name="holdings"),
+    schema=holdings_schema,
+    config=cfg,
+)
+# Extract from multiple sections → {name: dict}
+results = extract(
+    [
+        Text(holdings_text, name="holdings"),
+        Text(activity_text, name="activity"),
+    ],
+    schema=schema,
+    config=cfg,
+)
+```
+### Confidence scores
+When `confidence_scores: true`, every extracted field has a sibling `<field>_confidence` (0–1):
+| Score | Meaning |
+|-------|---------|
+| 1.0 | Value stated verbatim |
+| 0.7 | Clearly implied |
+| 0.4 | Inferred / uncertain |
+| 0.0 | Not found (field is `null`) |
+---
+## How it works
+### Forced tool call
+In Phase 2, the schema is registered as an LLM tool and `tool_choice` is set to require it. The LLM must populate the tool's arguments — giving deterministic, schema-validated JSON output instead of free-form text.
+### Two-pass schema generation
+Phase 1 uses two LLM calls:
+1. **Structural analysis** — the LLM reads the sample text and produces a detailed plain-text analysis of sections, fields, data shapes, and nesting
+2. **Schema design** — a second call turns the analysis into a JSON tool definition
+Separating "understand the document" from "design the schema" produces more complete and correctly structured schemas.
+### Streaming to prevent timeouts
+All requests use HTTP streaming (`"stream": true`). Without streaming, Ollama generates the entire response server-side before sending a single byte — so on slow or remote connections the client's read timeout (or an intermediary's idle timeout) can expire before any data arrives. Streaming keeps the connection alive throughout generation.
+---
+## Input format
+**Input is always plain text.** PDF conversion, OCR, and any other pre-processing are your responsibility. Tools like `pdftotext` (poppler) work well for PDFs with selectable text.
+---
+## Known limitations
+- **Per-row detail degrades on long documents.** For dense tabular data (e.g. transaction histories), extract page-by-page or section-by-section rather than feeding the entire document at once. The model's attention weakens over long contexts.
+- **Local models only.** No cloud LLM integration is planned.
+---
+## Development
+```bash
+pip install -e ".[dev]"
+pytest tests/
+```
+---
+## License
+MIT

textgleaner-1.2.0/README.md ADDED Viewed

@@ -0,0 +1,206 @@
+# textgleaner
+Extract structured data from plain-text documents using a local LLM.
+textgleaner uses a two-phase approach:
+1. **Generate schema** — the LLM analyzes sample documents and your description to produce a JSON extraction schema
+2. **Extract** — the LLM is forced to call the schema as a tool, returning deterministic, schema-validated JSON
+All inference runs via [Ollama](https://ollama.com) on a machine you control — locally by default, in which case no data leaves your machine.
+---
+## Requirements
+- Python 3.10+
+- [Ollama](https://ollama.com) running locally (or on a remote host)
+- A model that supports tool calls (e.g. `qwen3:30b`, `llama3.1:8b`)
+---
+## Installation
+```bash
+pip install textgleaner
+```
+Or from source:
+```bash
+git clone https://github.com/Lyutenant/text-gleaner
+cd text-gleaner
+pip install -e .
+```
+---
+## Configuration
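+Copy the example config and edit it (`config.example.yaml` is not part of the PyPI package; if you installed with `pip`, create `config.yaml` from the template below):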
+```bash
+cp config.example.yaml config.yaml
+```
+```yaml
+llm:
+  base_url: "http://localhost:11434"   # Ollama default
+  model: "qwen3:30b"
+  api_key: "local"
+  temperature: 0.2
+  max_tokens: 32768
+  timeout_seconds: 1800
+extraction:
+  confidence_scores: true
+  max_chars: 200000
+```
+You can also configure via environment variables:
+```bash
+export TEXTGLEANER__LLM__BASE_URL="http://localhost:11434"
+export TEXTGLEANER__LLM__MODEL="qwen3:30b"
+```
+---
+## CLI
+```bash
+# Phase 1: generate a schema from sample documents
+textgleaner generate-schema \
+  --samples sample1.txt sample2.txt \
+  --description description.yaml \
+  --output schema.json
+# Phase 2: extract structured data
+textgleaner extract \
+  --inputs statement.txt \
+  --schema schema.json \
+  --output result.json
+# Use a custom config file
+textgleaner --config myconfig.yaml extract --inputs doc.txt --schema schema.json
+```
+---
+## Python API
+### Quick start
+```python
+from textgleaner import Config, generate_schema, extract, Text
+# Load config from YAML
+cfg = Config.from_yaml("config.yaml")
+# Or set values directly
+cfg = Config(base_url="http://localhost:11434", model="qwen3:30b")
+# Phase 1: generate a schema
+schema = generate_schema(
+    samples=["jan.txt", "feb.txt"],
+    description="Monthly brokerage statement with holdings and transactions.",
+    output="schema.json",
+    config=cfg,
+)
+# Phase 2: extract from a single file
+result = extract("statement.txt", schema=schema, config=cfg)
+# Phase 2: extract from multiple files → {filename: dict}
+results = extract(["jan.txt", "feb.txt"], schema=schema, output="results.json", config=cfg)
+```
+### Sectionized extraction with `Text`
+Use `Text` to pass raw text slices directly — useful when you want to split a document before extracting:
+```python
+from textgleaner import Config, extract, Text
+cfg = Config.from_yaml("config.yaml")
+# Split a document on form-feed page breaks
+pages = open("statement.txt").read().split("\f")
+# Extract from a specific page range
+result = extract(
+    Text("".join(pages[4:8]), name="holdings"),
+    schema=holdings_schema,
+    config=cfg,
+)
+# Extract from multiple sections → {name: dict}
+results = extract(
+    [
+        Text(holdings_text, name="holdings"),
+        Text(activity_text, name="activity"),
+    ],
+    schema=schema,
+    config=cfg,
+)
+```
+### Confidence scores
+When `confidence_scores: true`, every extracted field has a sibling `<field>_confidence` (0–1):
+| Score | Meaning |
+|-------|---------|
+| 1.0 | Value stated verbatim |
+| 0.7 | Clearly implied |
+| 0.4 | Inferred / uncertain |
+| 0.0 | Not found (field is `null`) |
+---
+## How it works
+### Forced tool call
+In Phase 2, the schema is registered as an LLM tool and `tool_choice` is set to require it. The LLM must populate the tool's arguments — giving deterministic, schema-validated JSON output instead of free-form text.
+### Two-pass schema generation
+Phase 1 uses two LLM calls:
+1. **Structural analysis** — the LLM reads the sample text and produces a detailed plain-text analysis of sections, fields, data shapes, and nesting
+2. **Schema design** — a second call turns the analysis into a JSON tool definition
+Separating "understand the document" from "design the schema" produces more complete and correctly structured schemas.
+### Streaming to prevent timeouts
+All requests use HTTP streaming (`"stream": true`). Without streaming, Ollama generates the entire response server-side before sending a single byte — so on slow or remote connections the client's read timeout (or an intermediary's idle timeout) can expire before any data arrives. Streaming keeps the connection alive throughout generation.
+---
+## Input format
+**Input is always plain text.** PDF conversion, OCR, and any other pre-processing are your responsibility. Tools like `pdftotext` (poppler) work well for PDFs with selectable text.
+---
+## Known limitations
+- **Per-row detail degrades on long documents.** For dense tabular data (e.g. transaction histories), extract page-by-page or section-by-section rather than feeding the entire document at once. The model's attention weakens over long contexts.
+- **Local models only.** No cloud LLM integration is planned.
+---
+## Development
+```bash
+pip install -e ".[dev]"
+pytest tests/
+```
+---
+## License
+MIT

textgleaner-1.2.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "textgleaner"
+version = "1.2.0"
+description = "Structured data extraction from plain-text documents using local LLM tool calls"
+readme = "README.md"
+authors = [{ name = "Lyutenant" }]
+license = { text = "MIT" }
+requires-python = ">=3.10"
+keywords = ["llm", "extraction", "ollama", "nlp", "text", "structured-data"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Text Processing",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "typer>=0.12",
+    "httpx>=0.27",
+    "pydantic-settings>=2.0",
+    "pyyaml>=6.0",
+]
+[project.urls]
+Homepage = "https://github.com/Lyutenant/text-gleaner"
+Repository = "https://github.com/Lyutenant/text-gleaner"
+Issues = "https://github.com/Lyutenant/text-gleaner/issues"
+[project.optional-dependencies]
+dev = ["pytest>=8.0"]
+excel = ["openpyxl>=3.1"]
+[project.scripts]
+textgleaner = "textgleaner.cli:app"
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["textgleaner*"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]

textgleaner-1.2.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0