textgleaner 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Lyutenant
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,238 @@
1
+ Metadata-Version: 2.4
2
+ Name: textgleaner
3
+ Version: 1.2.0
4
+ Summary: Structured data extraction from plain-text documents using local LLM tool calls
5
+ Author: Lyutenant
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Lyutenant/text-gleaner
8
+ Project-URL: Repository, https://github.com/Lyutenant/text-gleaner
9
+ Project-URL: Issues, https://github.com/Lyutenant/text-gleaner/issues
10
+ Keywords: llm,extraction,ollama,nlp,text,structured-data
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Text Processing
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: typer>=0.12
24
+ Requires-Dist: httpx>=0.27
25
+ Requires-Dist: pydantic-settings>=2.0
26
+ Requires-Dist: pyyaml>=6.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: pytest>=8.0; extra == "dev"
29
+ Provides-Extra: excel
30
+ Requires-Dist: openpyxl>=3.1; extra == "excel"
31
+ Dynamic: license-file
32
+
33
+ # textgleaner
34
+
35
+ Extract structured data from plain-text documents using a local LLM.
36
+
37
+ textgleaner uses a two-phase approach:
38
+
39
+ 1. **Generate schema** — the LLM analyzes sample documents and your description to produce a JSON extraction schema
40
+ 2. **Extract** — the LLM is forced to call the schema as a tool, returning deterministic, schema-validated JSON
41
+
42
+ All inference runs locally via [Ollama](https://ollama.com). No data leaves your machine.
43
+
44
+ ---
45
+
46
+ ## Requirements
47
+
48
+ - Python 3.10+
49
+ - [Ollama](https://ollama.com) running locally (or on a remote host)
50
+ - A model that supports tool calls (e.g. `qwen3:30b`, `llama3.1:8b`)
51
+
52
+ ---
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install textgleaner
58
+ ```
59
+
60
+ Or from source:
61
+
62
+ ```bash
63
+ git clone https://github.com/Lyutenant/text-gleaner
64
+ cd text-gleaner
65
+ pip install -e .
66
+ ```
67
+
68
+ ---
69
+
70
+ ## Configuration
71
+
72
+ Copy the example config and edit it:
73
+
74
+ ```bash
75
+ cp config.example.yaml config.yaml
76
+ ```
77
+
78
+ ```yaml
79
+ llm:
80
+ base_url: "http://localhost:11434" # Ollama default
81
+ model: "qwen3:30b"
82
+ api_key: "local"
83
+ temperature: 0.2
84
+ max_tokens: 32768
85
+ timeout_seconds: 1800
86
+
87
+ extraction:
88
+ confidence_scores: true
89
+ max_chars: 200000
90
+ ```
91
+
92
+ You can also configure via environment variables:
93
+
94
+ ```bash
95
+ export TEXTGLEANER__LLM__BASE_URL="http://localhost:11434"
96
+ export TEXTGLEANER__LLM__MODEL="qwen3:30b"
97
+ ```
98
+
99
+ ---
100
+
101
+ ## CLI
102
+
103
+ ```bash
104
+ # Phase 1: generate a schema from sample documents
105
+ textgleaner generate-schema \
106
+ --samples sample1.txt sample2.txt \
107
+ --description description.yaml \
108
+ --output schema.json
109
+
110
+ # Phase 2: extract structured data
111
+ textgleaner extract \
112
+ --inputs statement.txt \
113
+ --schema schema.json \
114
+ --output result.json
115
+
116
+ # Use a custom config file
117
+ textgleaner --config myconfig.yaml extract --inputs doc.txt --schema schema.json
118
+ ```
119
+
120
+ ---
121
+
122
+ ## Python API
123
+
124
+ ### Quick start
125
+
126
+ ```python
127
+ from textgleaner import Config, generate_schema, extract, Text
128
+
129
+ # Load config from YAML
130
+ cfg = Config.from_yaml("config.yaml")
131
+
132
+ # Or set values directly
133
+ cfg = Config(base_url="http://localhost:11434", model="qwen3:30b")
134
+
135
+ # Phase 1: generate a schema
136
+ schema = generate_schema(
137
+ samples=["jan.txt", "feb.txt"],
138
+ description="Monthly brokerage statement with holdings and transactions.",
139
+ output="schema.json",
140
+ config=cfg,
141
+ )
142
+
143
+ # Phase 2: extract from a single file
144
+ result = extract("statement.txt", schema=schema, config=cfg)
145
+
146
+ # Phase 2: extract from multiple files → {filename: dict}
147
+ results = extract(["jan.txt", "feb.txt"], schema=schema, output="results.json", config=cfg)
148
+ ```
149
+
150
+ ### Sectionized extraction with `Text`
151
+
152
+ Use `Text` to pass raw text slices directly — useful when you want to split a document before extracting:
153
+
154
+ ```python
155
+ from textgleaner import Config, extract, Text
156
+
157
+ cfg = Config.from_yaml("config.yaml")
158
+
159
+ # Split a document on form-feed page breaks
160
+ pages = open("statement.txt").read().split("\f")
161
+
162
+ # Extract from a specific page range
163
+ result = extract(
164
+ Text("".join(pages[4:8]), name="holdings"),
165
+ schema=holdings_schema,
166
+ config=cfg,
167
+ )
168
+
169
+ # Extract from multiple sections → {name: dict}
170
+ results = extract(
171
+ [
172
+ Text(holdings_text, name="holdings"),
173
+ Text(activity_text, name="activity"),
174
+ ],
175
+ schema=schema,
176
+ config=cfg,
177
+ )
178
+ ```
179
+
180
+ ### Confidence scores
181
+
182
+ When `confidence_scores: true`, every extracted field has a sibling `<field>_confidence` (0–1):
183
+
184
+ | Score | Meaning |
185
+ |-------|---------|
186
+ | 1.0 | Value stated verbatim |
187
+ | 0.7 | Clearly implied |
188
+ | 0.4 | Inferred / uncertain |
189
+ | 0.0 | Not found (field is `null`) |
190
+
191
+ ---
192
+
193
+ ## How it works
194
+
195
+ ### Forced tool call
196
+
197
+ In Phase 2, the schema is registered as an LLM tool and `tool_choice` is set to require it. The LLM must populate the tool's arguments — giving deterministic, schema-validated JSON output instead of free-form text.
198
+
199
+ ### Two-pass schema generation
200
+
201
+ Phase 1 uses two LLM calls:
202
+
203
+ 1. **Structural analysis** — the LLM reads the sample text and produces a detailed plain-text analysis of sections, fields, data shapes, and nesting
204
+ 2. **Schema design** — a second call turns the analysis into a JSON tool definition
205
+
206
+ Separating "understand the document" from "design the schema" produces more complete and correctly structured schemas.
207
+
208
+ ### Streaming to prevent timeouts
209
+
210
+ All requests use HTTP streaming (`"stream": true`). Without streaming, Ollama generates the entire response server-side before sending a single byte — causing TCP timeouts on slow or remote connections before any data arrives. Streaming keeps the connection alive throughout generation.
211
+
212
+ ---
213
+
214
+ ## Input format
215
+
216
+ **Input is always plain text.** PDF conversion, OCR, and any other pre-processing are your responsibility. Tools like `pdftotext` (poppler) work well for PDFs with selectable text.
217
+
218
+ ---
219
+
220
+ ## Known limitations
221
+
222
+ - **Per-row detail degrades on long documents.** For dense tabular data (e.g. transaction histories), extract page-by-page or section-by-section rather than feeding the entire document at once. The model's attention weakens over long contexts.
223
+ - **Local models only.** No cloud LLM integration is planned.
224
+
225
+ ---
226
+
227
+ ## Development
228
+
229
+ ```bash
230
+ pip install -e ".[dev]"
231
+ pytest tests/
232
+ ```
233
+
234
+ ---
235
+
236
+ ## License
237
+
238
+ MIT
@@ -0,0 +1,206 @@
1
+ # textgleaner
2
+
3
+ Extract structured data from plain-text documents using a local LLM.
4
+
5
+ textgleaner uses a two-phase approach:
6
+
7
+ 1. **Generate schema** — the LLM analyzes sample documents and your description to produce a JSON extraction schema
8
+ 2. **Extract** — the LLM is forced to call the schema as a tool, returning deterministic, schema-validated JSON
9
+
10
+ All inference runs locally via [Ollama](https://ollama.com). No data leaves your machine.
11
+
12
+ ---
13
+
14
+ ## Requirements
15
+
16
+ - Python 3.10+
17
+ - [Ollama](https://ollama.com) running locally (or on a remote host)
18
+ - A model that supports tool calls (e.g. `qwen3:30b`, `llama3.1:8b`)
19
+
20
+ ---
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install textgleaner
26
+ ```
27
+
28
+ Or from source:
29
+
30
+ ```bash
31
+ git clone https://github.com/Lyutenant/text-gleaner
32
+ cd text-gleaner
33
+ pip install -e .
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Configuration
39
+
40
+ Copy the example config and edit it:
41
+
42
+ ```bash
43
+ cp config.example.yaml config.yaml
44
+ ```
45
+
46
+ ```yaml
47
+ llm:
48
+ base_url: "http://localhost:11434" # Ollama default
49
+ model: "qwen3:30b"
50
+ api_key: "local"
51
+ temperature: 0.2
52
+ max_tokens: 32768
53
+ timeout_seconds: 1800
54
+
55
+ extraction:
56
+ confidence_scores: true
57
+ max_chars: 200000
58
+ ```
59
+
60
+ You can also configure via environment variables:
61
+
62
+ ```bash
63
+ export TEXTGLEANER__LLM__BASE_URL="http://localhost:11434"
64
+ export TEXTGLEANER__LLM__MODEL="qwen3:30b"
65
+ ```
66
+
67
+ ---
68
+
69
+ ## CLI
70
+
71
+ ```bash
72
+ # Phase 1: generate a schema from sample documents
73
+ textgleaner generate-schema \
74
+ --samples sample1.txt sample2.txt \
75
+ --description description.yaml \
76
+ --output schema.json
77
+
78
+ # Phase 2: extract structured data
79
+ textgleaner extract \
80
+ --inputs statement.txt \
81
+ --schema schema.json \
82
+ --output result.json
83
+
84
+ # Use a custom config file
85
+ textgleaner --config myconfig.yaml extract --inputs doc.txt --schema schema.json
86
+ ```
87
+
88
+ ---
89
+
90
+ ## Python API
91
+
92
+ ### Quick start
93
+
94
+ ```python
95
+ from textgleaner import Config, generate_schema, extract, Text
96
+
97
+ # Load config from YAML
98
+ cfg = Config.from_yaml("config.yaml")
99
+
100
+ # Or set values directly
101
+ cfg = Config(base_url="http://localhost:11434", model="qwen3:30b")
102
+
103
+ # Phase 1: generate a schema
104
+ schema = generate_schema(
105
+ samples=["jan.txt", "feb.txt"],
106
+ description="Monthly brokerage statement with holdings and transactions.",
107
+ output="schema.json",
108
+ config=cfg,
109
+ )
110
+
111
+ # Phase 2: extract from a single file
112
+ result = extract("statement.txt", schema=schema, config=cfg)
113
+
114
+ # Phase 2: extract from multiple files → {filename: dict}
115
+ results = extract(["jan.txt", "feb.txt"], schema=schema, output="results.json", config=cfg)
116
+ ```
117
+
118
+ ### Sectionized extraction with `Text`
119
+
120
+ Use `Text` to pass raw text slices directly — useful when you want to split a document before extracting:
121
+
122
+ ```python
123
+ from textgleaner import Config, extract, Text
124
+
125
+ cfg = Config.from_yaml("config.yaml")
126
+
127
+ # Split a document on form-feed page breaks
128
+ pages = open("statement.txt").read().split("\f")
129
+
130
+ # Extract from a specific page range
131
+ result = extract(
132
+ Text("".join(pages[4:8]), name="holdings"),
133
+ schema=holdings_schema,
134
+ config=cfg,
135
+ )
136
+
137
+ # Extract from multiple sections → {name: dict}
138
+ results = extract(
139
+ [
140
+ Text(holdings_text, name="holdings"),
141
+ Text(activity_text, name="activity"),
142
+ ],
143
+ schema=schema,
144
+ config=cfg,
145
+ )
146
+ ```
147
+
148
+ ### Confidence scores
149
+
150
+ When `confidence_scores: true`, every extracted field has a sibling `<field>_confidence` (0–1):
151
+
152
+ | Score | Meaning |
153
+ |-------|---------|
154
+ | 1.0 | Value stated verbatim |
155
+ | 0.7 | Clearly implied |
156
+ | 0.4 | Inferred / uncertain |
157
+ | 0.0 | Not found (field is `null`) |
158
+
159
+ ---
160
+
161
+ ## How it works
162
+
163
+ ### Forced tool call
164
+
165
+ In Phase 2, the schema is registered as an LLM tool and `tool_choice` is set to require it. The LLM must populate the tool's arguments — giving deterministic, schema-validated JSON output instead of free-form text.
166
+
167
+ ### Two-pass schema generation
168
+
169
+ Phase 1 uses two LLM calls:
170
+
171
+ 1. **Structural analysis** — the LLM reads the sample text and produces a detailed plain-text analysis of sections, fields, data shapes, and nesting
172
+ 2. **Schema design** — a second call turns the analysis into a JSON tool definition
173
+
174
+ Separating "understand the document" from "design the schema" produces more complete and correctly structured schemas.
175
+
176
+ ### Streaming to prevent timeouts
177
+
178
+ All requests use HTTP streaming (`"stream": true`). Without streaming, Ollama generates the entire response server-side before sending a single byte — causing TCP timeouts on slow or remote connections before any data arrives. Streaming keeps the connection alive throughout generation.
179
+
180
+ ---
181
+
182
+ ## Input format
183
+
184
+ **Input is always plain text.** PDF conversion, OCR, and any other pre-processing are your responsibility. Tools like `pdftotext` (poppler) work well for PDFs with selectable text.
185
+
186
+ ---
187
+
188
+ ## Known limitations
189
+
190
+ - **Per-row detail degrades on long documents.** For dense tabular data (e.g. transaction histories), extract page-by-page or section-by-section rather than feeding the entire document at once. The model's attention weakens over long contexts.
191
+ - **Local models only.** No cloud LLM integration is planned.
192
+
193
+ ---
194
+
195
+ ## Development
196
+
197
+ ```bash
198
+ pip install -e ".[dev]"
199
+ pytest tests/
200
+ ```
201
+
202
+ ---
203
+
204
+ ## License
205
+
206
+ MIT
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "textgleaner"
7
+ version = "1.2.0"
8
+ description = "Structured data extraction from plain-text documents using local LLM tool calls"
9
+ readme = "README.md"
10
+ authors = [{ name = "Lyutenant" }]
11
+ license = { text = "MIT" }
12
+ requires-python = ">=3.10"
13
+ keywords = ["llm", "extraction", "ollama", "nlp", "text", "structured-data"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Text Processing",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+ dependencies = [
26
+ "typer>=0.12",
27
+ "httpx>=0.27",
28
+ "pydantic-settings>=2.0",
29
+ "pyyaml>=6.0",
30
+ ]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/Lyutenant/text-gleaner"
34
+ Repository = "https://github.com/Lyutenant/text-gleaner"
35
+ Issues = "https://github.com/Lyutenant/text-gleaner/issues"
36
+
37
+ [project.optional-dependencies]
38
+ dev = ["pytest>=8.0"]
39
+ excel = ["openpyxl>=3.1"]
40
+
41
+ [project.scripts]
42
+ textgleaner = "textgleaner.cli:app"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["."]
46
+ include = ["textgleaner*"]
47
+
48
+ [tool.pytest.ini_options]
49
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+