textgleaner 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,545 @@
1
+ from __future__ import annotations
2
+ import json as _json
3
+ from pathlib import Path
4
+ from typing import Callable, Union
5
+
6
+ from .schema_generator import generate_schema as _generate_schema
7
+ from .schema_refiner import refine_schema as _refine_schema
8
+ from .extractor import extract as _extract
9
+ from .reporter import (
10
+ summarize as _summarize,
11
+ write_csv, write_excel, write_summary_csv,
12
+ build_validation_report, format_validation_report,
13
+ )
14
+
15
+ PathLike = Union[str, Path]
16
+
17
+
18
class Config:
    """Programmatic configuration container for textgleaner.

    Precedence order: values stored on a Config instance override
    environment variables, while explicit keyword arguments passed to
    :func:`extract` or :func:`generate_schema` override the Config values.

    Usage::

        from textgleaner import Config, extract

        # Load from a YAML file
        cfg = Config.from_yaml("config.yaml")

        # Or set values directly in code
        cfg = Config(base_url="http://myserver:11434", model="qwen3:30b")

        # Pass to functions — replaces 6 individual kwargs
        result = extract("doc.txt", schema=schema, config=cfg)
    """

    def __init__(
        self,
        *,
        base_url: Union[str, None] = None,
        model: Union[str, None] = None,
        api_key: Union[str, None] = None,
        temperature: Union[float, None] = None,
        max_tokens: Union[int, None] = None,
        timeout: Union[int, None] = None,
        confidence_scores: Union[bool, None] = None,
        max_chars: Union[int, None] = None,
        extraction_method: Union[str, None] = None,
        confidence_retry: Union[bool, None] = None,
        model_profile: Union[str, None] = None,
    ):
        # Plain attribute storage; unset options stay None so that
        # _merge_config() can tell "not configured" apart from real values.
        self.__dict__.update(
            base_url=base_url,
            model=model,
            api_key=api_key,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=timeout,
            confidence_scores=confidence_scores,
            max_chars=max_chars,
            extraction_method=extraction_method,
            confidence_retry=confidence_retry,
            model_profile=model_profile,
        )

    @classmethod
    def from_yaml(cls, path: PathLike) -> "Config":
        """Build a Config from a YAML file.

        Expected file structure::

            llm:
              base_url: "http://localhost:11434"
              model: "qwen3:30b"
              api_key: "local"
              temperature: 0.2
              max_tokens: 32768
              timeout_seconds: 1800

            extraction:
              confidence_scores: true
              max_chars: 200000
              extraction_method: tool_call  # tool_call | structured_output | auto

        Args:
            path: Location of the YAML config file.

        Raises:
            FileNotFoundError: If *path* does not exist.
        """
        import yaml

        cfg_path = Path(path)
        if not cfg_path.exists():
            raise FileNotFoundError(f"Config file not found: {cfg_path}")
        with cfg_path.open() as fh:
            raw = yaml.safe_load(fh) or {}
        llm_section = raw.get("llm", {})
        ext_section = raw.get("extraction", {})
        # Note: YAML key "timeout_seconds" maps onto the "timeout" attribute.
        return cls(
            base_url=llm_section.get("base_url"),
            model=llm_section.get("model"),
            api_key=llm_section.get("api_key"),
            temperature=llm_section.get("temperature"),
            max_tokens=llm_section.get("max_tokens"),
            timeout=llm_section.get("timeout_seconds"),
            confidence_scores=ext_section.get("confidence_scores"),
            max_chars=ext_section.get("max_chars"),
            extraction_method=ext_section.get("extraction_method"),
            confidence_retry=ext_section.get("confidence_retry"),
            model_profile=llm_section.get("model_profile"),
        )
110
+
111
+
112
class Text:
    """In-memory document input for extract() and generate_schema().

    Wraps a raw text string so it can be passed anywhere a file path is
    accepted — useful when the text is a pre-sliced section of a larger
    document rather than a file on disk.

    Args:
        content: The plain-text content to extract from or use as a sample.
        name: Human-readable label, used as the key in multi-input results
            and in log messages. Defaults to "<text>".

    Example::

        from textgleaner import extract, Text

        pages = document_text.split("\\f")  # split on form-feed / page break
        holdings_text = "\\n".join(pages[4:8])  # pages 5-8

        result = extract(Text(holdings_text, name="holdings"), schema=holdings_schema)
    """

    def __init__(self, content: str, name: str = "<text>"):
        self.name = name
        self.content = content
136
+
137
+
138
# Internal type: a resolved (text, label) pair — (document_text, display_name) —
# produced by _resolve_input() and consumed by the core schema/extract functions.
_TextPair = tuple[str, str]
140
+
141
+
142
def _merge_config(config: Union[Config, None], **kwargs) -> dict:
    """Combine values from a Config object with explicit keyword overrides.

    Non-None attributes on *config* are copied in first; non-None kwargs are
    applied afterwards, so explicit kwargs always take priority.
    """
    known_attrs = (
        "base_url", "model", "api_key", "temperature",
        "max_tokens", "timeout", "confidence_scores", "max_chars",
        "extraction_method", "confidence_retry", "model_profile",
    )
    merged: dict = {}
    if config is not None:
        merged.update(
            (name, getattr(config, name, None))
            for name in known_attrs
            if getattr(config, name, None) is not None
        )
    merged.update((key, value) for key, value in kwargs.items() if value is not None)
    return merged
158
+
159
+
160
def generate_schema(
    samples: Union[PathLike, Text, list[Union[PathLike, Text]]],
    description: Union[str, PathLike],
    output: Union[PathLike, None] = None,
    *,
    config: Union[Config, None] = None,
    confidence_scores: Union[bool, None] = None,
    model_profile: Union[str, None] = None,
    base_url: Union[str, None] = None,
    model: Union[str, None] = None,
    api_key: Union[str, None] = None,
    temperature: Union[float, None] = None,
    max_tokens: Union[int, None] = None,
    timeout: Union[int, None] = None,
) -> dict:
    """
    Phase 1: Generate a JSON extraction schema from sample documents.

    Args:
        samples: One or more sample documents; each may be a file path
            (str or Path) or a :class:`Text` instance with raw text.
        description: A raw string describing the document type and fields,
            or a path to a .yaml / .md description file.
        output: Optional path to write the schema JSON. When omitted the
            schema is returned but not saved.
        config: A :class:`Config` instance (from ``Config.from_yaml()`` or
            ``Config(...)``). The individual kwargs below override it.
        confidence_scores: Include _confidence sibling fields in the schema.
            Overrides config. Defaults to TEXTGLEANER__EXTRACTION__CONFIDENCE_SCORES
            env var, or True.
        base_url: LLM server base URL. Overrides config. Defaults to TEXTGLEANER__LLM__BASE_URL.
        model: Model name. Overrides config. Defaults to TEXTGLEANER__LLM__MODEL.
        api_key: API key. Overrides config. Defaults to TEXTGLEANER__LLM__API_KEY.
        temperature: Sampling temperature. Overrides config. Defaults to TEXTGLEANER__LLM__TEMPERATURE.
        max_tokens: Max tokens to generate. Overrides config. Defaults to TEXTGLEANER__LLM__MAX_TOKENS.
        timeout: Request timeout in seconds. Overrides config. Defaults to TEXTGLEANER__LLM__TIMEOUT_SECONDS.

    Returns:
        The generated schema as a dict.
    """
    sample_list = samples if isinstance(samples, list) else [samples]
    sample_pairs = [_resolve_input(item) for item in sample_list]
    desc_str = _resolve_description(description)
    out_path = Path(output) if output is not None else None

    resolved = _merge_config(
        config,
        confidence_scores=confidence_scores,
        model_profile=model_profile,
        base_url=base_url,
        model=model,
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=timeout,
    )
    # Forward every LLM/extraction option; keys absent from `resolved`
    # are passed as None so the core function applies its own defaults.
    option_keys = (
        "confidence_scores", "model_profile", "base_url", "model",
        "api_key", "temperature", "max_tokens", "timeout",
    )
    return _generate_schema(
        sample_pairs,
        desc_str,
        out_path,
        **{key: resolved.get(key) for key in option_keys},
    )
230
+
231
+
232
def refine_schema(
    schema: Union[dict, PathLike],
    samples: Union[PathLike, Text, list[Union[PathLike, Text]]],
    output: Union[PathLike, None] = None,
    *,
    config: Union[Config, None] = None,
    confidence_scores: Union[bool, None] = None,
    model_profile: Union[str, None] = None,
    base_url: Union[str, None] = None,
    model: Union[str, None] = None,
    api_key: Union[str, None] = None,
    temperature: Union[float, None] = None,
    max_tokens: Union[int, None] = None,
    timeout: Union[int, None] = None,
) -> dict:
    """
    Update an existing schema from new sample documents without re-running Phase 1 from scratch.

    Runs a two-pass refinement:

    * **Pass 1** — gap analysis: the LLM compares the new samples against the existing
      schema and identifies missing fields, type mismatches, structural issues, and
      description improvements.
    * **Pass 2** — schema update: the LLM produces the complete updated schema JSON,
      preserving all existing fields unless the gap analysis recommends a change.

    Confidence score fields (``<field>_confidence`` siblings) are automatically
    detected from the existing schema and added for any new fields.

    Args:
        schema: The existing schema as a dict or a path to a ``.json`` file.
        samples: One or more new sample documents (file paths or :class:`Text` instances).
        output: Optional path to write the updated schema. If omitted, the schema is
            returned but not saved.
        config: :class:`Config` instance.
        confidence_scores: Override auto-detection of whether to include confidence
            fields for new properties.
        base_url, model, api_key, temperature, max_tokens, timeout:
            LLM overrides (same as :func:`generate_schema`).

    Returns:
        The updated schema dict.
    """
    if isinstance(schema, dict):
        schema_dict = schema
    else:
        # Schema files are UTF-8 JSON; be explicit so the read does not depend
        # on the platform's locale encoding (e.g. cp1252 on Windows).
        with open(schema, encoding="utf-8") as f:
            schema_dict = _json.load(f)

    if not isinstance(samples, list):
        samples = [samples]
    sample_pairs = [_resolve_input(s) for s in samples]
    out_path = Path(output) if output is not None else None

    # Config values fill in gaps; explicit kwargs above always win.
    resolved = _merge_config(
        config,
        confidence_scores=confidence_scores,
        model_profile=model_profile,
        base_url=base_url,
        model=model,
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=timeout,
    )
    return _refine_schema(
        schema_dict,
        sample_pairs,
        out_path,
        confidence_scores=resolved.get("confidence_scores"),
        model_profile=resolved.get("model_profile"),
        base_url=resolved.get("base_url"),
        model=resolved.get("model"),
        api_key=resolved.get("api_key"),
        temperature=resolved.get("temperature"),
        max_tokens=resolved.get("max_tokens"),
        timeout=resolved.get("timeout"),
    )
310
+
311
+
312
def extract(
    inputs: Union[PathLike, Text, list[Union[PathLike, Text]]],
    schema: Union[dict, PathLike],
    output: Union[PathLike, None] = None,
    max_chars: Union[int, None] = None,
    *,
    config: Union[Config, None] = None,
    on_result: Union[Callable[[str, dict], None], None] = None,
    extraction_method: Union[str, None] = None,
    confidence_retry: Union[bool, None] = None,
    model_profile: Union[str, None] = None,
    base_url: Union[str, None] = None,
    model: Union[str, None] = None,
    api_key: Union[str, None] = None,
    temperature: Union[float, None] = None,
    max_tokens: Union[int, None] = None,
    timeout: Union[int, None] = None,
) -> dict:
    """
    Phase 2: Extract structured data from one or more documents using a schema.

    Args:
        inputs: One or more documents to extract from. Each can be a file path
            (str or Path) or a :class:`Text` instance containing raw text.
        schema: The extraction schema as a dict, or a path to a schema .json file.
        output: Optional path to write the result. A ``.csv`` or ``.xlsx``
            extension (any letter case) writes tabular output; anything else
            is treated as JSON.
        max_chars: Max characters per input before raising an error. Overrides
            config. Defaults to TEXTGLEANER__EXTRACTION__MAX_CHARS, or 200,000.
            Set to 0 to disable the limit.
        config: A :class:`Config` instance (from ``Config.from_yaml()`` or
            ``Config(...)``). Individual kwargs below override config values.
        extraction_method: ``"tool_call"`` (default), ``"structured_output"``, or
            ``"auto"``. Overrides config.
            ``tool_call`` — forces a function/tool call; falls back to
            content JSON if the model ignores tool_choice.
            ``structured_output`` — uses ``response_format`` with
            ``json_schema`` (grammar-constrained decoding); works with
            models that handle tool_choice poorly.
            ``auto`` — tries ``tool_call`` first; falls back to
            ``structured_output`` if the model returns unparseable
            output or the server returns HTTP 400/422.
        base_url: LLM server base URL. Overrides config. Defaults to TEXTGLEANER__LLM__BASE_URL.
        model: Model name. Overrides config. Defaults to TEXTGLEANER__LLM__MODEL.
        api_key: API key. Overrides config. Defaults to TEXTGLEANER__LLM__API_KEY.
        temperature: Sampling temperature. Overrides config. Defaults to TEXTGLEANER__LLM__TEMPERATURE.
        max_tokens: Max tokens to generate. Overrides config. Defaults to TEXTGLEANER__LLM__MAX_TOKENS.
        timeout: Request timeout in seconds. Overrides config. Defaults to TEXTGLEANER__LLM__TIMEOUT_SECONDS.

    Returns:
        For a single input: the extracted data dict.
        For multiple inputs: {name: extracted_data_dict, ...}
    """
    if isinstance(inputs, (str, Path, Text)):
        single = True
        input_pairs = [_resolve_input(inputs)]
    else:
        single = False
        input_pairs = [_resolve_input(i) for i in inputs]

    if isinstance(schema, dict):
        schema_dict = schema
    else:
        # Schema files are UTF-8 JSON; read explicitly as UTF-8 so behavior
        # doesn't depend on the platform's locale encoding.
        with open(schema, encoding="utf-8") as f:
            schema_dict = _json.load(f)

    out_path = Path(output) if output is not None else None
    # Compare extensions case-insensitively so ".CSV" / ".Xlsx" are honoured.
    out_suffix = out_path.suffix.lower() if out_path is not None else ""
    non_json = out_suffix in (".csv", ".xlsx")

    resolved = _merge_config(
        config,
        max_chars=max_chars,
        extraction_method=extraction_method,
        confidence_retry=confidence_retry,
        model_profile=model_profile,
        base_url=base_url,
        model=model,
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=timeout,
    )
    # For CSV/Excel we collect results first, then write ourselves.
    # For JSON we let _extract() write directly.
    results = _extract(
        input_pairs,
        schema_dict,
        None if non_json else out_path,
        single,
        max_chars=resolved.get("max_chars"),
        extraction_method=resolved.get("extraction_method"),
        confidence_retry=resolved.get("confidence_retry"),
        model_profile=resolved.get("model_profile"),
        on_result=on_result,
        base_url=resolved.get("base_url"),
        model=resolved.get("model"),
        api_key=resolved.get("api_key"),
        temperature=resolved.get("temperature"),
        max_tokens=resolved.get("max_tokens"),
        timeout=resolved.get("timeout"),
    )

    if non_json and out_path is not None:
        # Normalise to {name: dict} even for single-input results
        results_dict = results if not single else {input_pairs[0][1]: results}
        out_path.parent.mkdir(parents=True, exist_ok=True)
        if out_suffix == ".csv":
            write_csv(results_dict, out_path)
        else:
            write_excel(results_dict, out_path)

    return results
423
+
424
+
425
def summarize(
    results: dict,
    output: Union[PathLike, None] = None,
) -> dict:
    """Compute per-field null-rate and average confidence from extract() results.

    Args:
        results: The dict returned by :func:`extract` for multiple inputs —
            ``{name: extracted_dict, ...}``.
        output: Optional path; when given, the summary is written there as a
            CSV file with columns ``field``, ``null_rate``, ``avg_confidence``.

    Returns:
        ``{field_name: {"null_rate": float, "avg_confidence": float | None}, ...}``
    """
    summary = _summarize(results)
    if output is None:
        return summary
    out_file = Path(output)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    write_summary_csv(summary, out_file)
    return summary
446
+
447
+
448
def validate(
    inputs: Union[PathLike, Text, list[Union[PathLike, Text]]],
    schema: Union[dict, PathLike],
    *,
    config: Union[Config, None] = None,
    null_threshold: float = 0.5,
    confidence_threshold: float = 0.5,
    output: Union[PathLike, None] = None,
    base_url: Union[str, None] = None,
    model: Union[str, None] = None,
    api_key: Union[str, None] = None,
    temperature: Union[float, None] = None,
    max_tokens: Union[int, None] = None,
    timeout: Union[int, None] = None,
) -> dict:
    """Dry-run extraction on sample documents and report per-field quality.

    Runs :func:`extract` on the provided samples, then classifies each schema
    field as OK, high-null, always-null, or low-confidence. Prints a formatted
    table to stdout and returns the full report dict.

    Use this to iterate on your schema before running a full batch extraction.

    Args:
        inputs: One or more sample documents (file paths or :class:`Text` instances).
        schema: Schema dict or path to a schema JSON file.
        config: :class:`Config` instance.
        null_threshold: null_rate above which a field is flagged ``high_null``
            (default 0.5).
        confidence_threshold: avg_confidence below which a field is flagged
            ``low_confidence`` (default 0.5).
        output: Optional path to save the report as JSON.
        base_url, model, api_key, temperature, max_tokens, timeout:
            LLM overrides (same as :func:`extract`).

    Returns:
        Report dict with ``"fields"``, ``"counts"``, and threshold values.
    """
    # Normalise to a list so extract() always returns {name: dict}
    if isinstance(inputs, (str, Path, Text)):
        inputs = [inputs]

    results = extract(
        inputs,
        schema,
        config=config,
        base_url=base_url,
        model=model,
        api_key=api_key,
        temperature=temperature,
        max_tokens=max_tokens,
        timeout=timeout,
    )

    summary = _summarize(results)
    report = build_validation_report(summary, null_threshold, confidence_threshold)

    # Show at most three sample names, with an ellipsis when truncated.
    n = len(results)
    names = list(results.keys())
    sample_label = ", ".join(names[:3]) + (" …" if n > 3 else "")
    print(f"Samples ({n}): {sample_label}")
    print(f"Thresholds: null > {null_threshold:.0%} confidence < {confidence_threshold:.0%}\n")
    print(format_validation_report(report))

    if output is not None:
        # Reuse the module-level json import rather than re-importing locally.
        p = Path(output)
        p.parent.mkdir(parents=True, exist_ok=True)
        with p.open("w") as f:
            _json.dump(report, f, indent=2)
            f.write("\n")
        print(f"\nReport saved to {output}")

    return report
522
+
523
+
524
def _resolve_input(item: Union[PathLike, Text]) -> _TextPair:
    """Normalise a file path or Text instance into a (text, name) pair."""
    if not isinstance(item, Text):
        source = Path(item)
        # Replace undecodable bytes rather than failing on malformed input files.
        return (source.read_text(encoding="utf-8", errors="replace"), source.name)
    return (item.content, item.name)
530
+
531
+
532
def _resolve_description(description: Union[str, PathLike]) -> str:
    """Resolve a description argument to a plain string.

    *description* may be:

    * a raw description string — returned unchanged;
    * a path (str or Path) to a ``.yaml``/``.yml`` file — parsed and
      re-dumped as normalised YAML text;
    * a path to any other text file (e.g. ``.md``) — returned as its contents.

    A plain string is only treated as a path when it is short, single-line,
    and names an existing file, so free-form descriptions are never misread
    as paths.
    """
    p = Path(description) if not isinstance(description, str) else None
    if p is None and "\n" not in description and len(description) < 512:
        candidate = Path(description)
        if candidate.exists() and candidate.is_file():
            p = candidate
    if p is not None and p.exists():
        with p.open(encoding="utf-8") as f:
            if p.suffix in (".yaml", ".yml"):
                # Imported lazily so PyYAML is only required when a YAML
                # description file is actually used.
                import yaml
                content = yaml.safe_load(f)
                return yaml.dump(content, default_flow_style=False)
            return f.read()
    return str(description)