dataknobs-xization 1.2.3 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,367 @@ dataknobs_xization/ingestion/processor.py

"""Directory processor for knowledge base ingestion.

This module provides the DirectoryProcessor class for processing
documents from a directory into chunks ready for embedding.
"""

from __future__ import annotations

import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Iterator, Literal

from dataknobs_xization.ingestion.config import (
    FilePatternConfig,
    KnowledgeBaseConfig,
)
from dataknobs_xization.json import JSONChunk, JSONChunkConfig, JSONChunker
from dataknobs_xization.markdown import (
    ChunkQualityConfig,
    HeadingInclusion,
    chunk_markdown_tree,
    parse_markdown,
)

logger = logging.getLogger(__name__)


@dataclass
class ProcessedDocument:
    """A processed document ready for embedding and storage.

    Contains chunks from a single source file along with metadata
    about the processing.

    Attributes:
        source_file: Path to the source file
        document_type: Type of document (markdown, json, jsonl)
        chunks: List of processed chunks
        metadata: Document-level metadata
        errors: Any errors encountered during processing
    """

    source_file: str
    document_type: Literal["markdown", "json", "jsonl"]
    chunks: list[dict[str, Any]]
    metadata: dict[str, Any] = field(default_factory=dict)
    errors: list[str] = field(default_factory=list)

    @property
    def chunk_count(self) -> int:
        """Number of chunks in this document."""
        return len(self.chunks)

    @property
    def has_errors(self) -> bool:
        """Whether processing encountered errors."""
        return len(self.errors) > 0


# File size threshold for streaming (10MB)
STREAMING_THRESHOLD_BYTES = 10 * 1024 * 1024

# Config file names to always exclude from processing
CONFIG_FILE_NAMES = {"knowledge_base.json", "knowledge_base.yaml", "knowledge_base.yml"}


class DirectoryProcessor:
    """Process documents from a directory for knowledge base ingestion.

    Handles markdown and JSON files with configurable chunking,
    supporting both in-memory and streaming processing for large files.

    Attributes:
        config: Knowledge base configuration
        root_dir: Root directory for processing
    """

    def __init__(self, config: KnowledgeBaseConfig, root_dir: str | Path):
        """Initialize the directory processor.

        Args:
            config: Knowledge base configuration
            root_dir: Root directory containing documents
        """
        self.config = config
        self.root_dir = Path(root_dir)

    def process(self) -> Iterator[ProcessedDocument]:
        """Process all documents in the directory.

        Yields ProcessedDocument for each file, automatically using
        streaming for large JSON files to avoid memory exhaustion.

        Yields:
            ProcessedDocument for each processed file
        """
        # Collect all files first
        files = self._collect_files()

        for filepath in files:
            rel_path = filepath.relative_to(self.root_dir)

            # Skip config files
            if filepath.name in CONFIG_FILE_NAMES:
                logger.debug(f"Skipping config file: {rel_path}")
                continue

            # Skip excluded files
            if self.config.is_excluded(rel_path):
                logger.debug(f"Skipping excluded file: {rel_path}")
                continue

            # Get pattern config if any
            pattern_config = self.config.get_pattern_config(rel_path)

            # Process based on file type
            suffix = filepath.suffix.lower()
            if suffix == ".md":
                yield from self._process_markdown(filepath, pattern_config)
            elif suffix in (".json", ".jsonl", ".ndjson"):
                yield from self._process_json(filepath, pattern_config)
            elif suffix == ".gz":
                # Check inner extension for compressed files
                inner_suffix = Path(filepath.stem).suffix.lower()
                if inner_suffix in (".json", ".jsonl", ".ndjson"):
                    yield from self._process_json(filepath, pattern_config)
                else:
                    logger.debug(f"Skipping unsupported compressed file: {rel_path}")
            else:
                logger.debug(f"Skipping unsupported file type: {rel_path}")

    def _collect_files(self) -> list[Path]:
        """Collect all files to process from the directory.

        Returns:
            List of file paths
        """
        files = []

        # If patterns are defined, use them to find files
        if self.config.patterns:
            for pattern_config in self.config.patterns:
                if pattern_config.enabled:
                    for filepath in self.root_dir.glob(pattern_config.pattern):
                        if filepath.is_file() and filepath not in files:
                            files.append(filepath)
        else:
            # Default: find all supported files
            for pattern in ["**/*.md", "**/*.json", "**/*.jsonl", "**/*.ndjson",
                            "**/*.json.gz", "**/*.jsonl.gz", "**/*.ndjson.gz"]:
                for filepath in self.root_dir.glob(pattern):
                    if filepath.is_file() and filepath not in files:
                        files.append(filepath)

        return sorted(files)

    def _process_markdown(
        self,
        filepath: Path,
        pattern_config: FilePatternConfig | None,
    ) -> Iterator[ProcessedDocument]:
        """Process a markdown file.

        Args:
            filepath: Path to markdown file
            pattern_config: Optional pattern-specific configuration

        Yields:
            ProcessedDocument for the file
        """
        errors: list[str] = []
        chunks: list[dict[str, Any]] = []

        try:
            # Read file
            with open(filepath, encoding="utf-8") as f:
                content = f.read()

            # Get chunking config
            chunking_config = self.config.get_chunking_config(
                filepath.relative_to(self.root_dir)
            )

            # Build quality filter if configured
            quality_filter = None
            if self.config.default_quality_filter:
                quality_filter = ChunkQualityConfig(**self.config.default_quality_filter)

            # Parse and chunk
            tree = parse_markdown(content)
            md_chunks = chunk_markdown_tree(
                tree,
                max_chunk_size=chunking_config.get("max_chunk_size", 500),
                chunk_overlap=chunking_config.get("chunk_overlap", 50),
                heading_inclusion=HeadingInclusion.IN_METADATA,
                combine_under_heading=chunking_config.get("combine_under_heading", True),
                quality_filter=quality_filter,
                generate_embeddings=True,
            )

            # Convert to dictionaries
            for i, chunk in enumerate(md_chunks):
                chunk_dict = {
                    "text": chunk.text,
                    "embedding_text": chunk.metadata.embedding_text or chunk.text,
                    "chunk_index": i,
                    "source_path": "",
                    "metadata": {
                        "heading_path": chunk.metadata.heading_display or chunk.metadata.get_heading_path(),
                        "headings": chunk.metadata.headings,
                        "heading_levels": chunk.metadata.heading_levels,
                        "line_number": chunk.metadata.line_number,
                        "chunk_size": chunk.metadata.chunk_size,
                    },
                }
                chunks.append(chunk_dict)

        except Exception as e:
            errors.append(f"Failed to process markdown: {e}")
            logger.error(f"Error processing {filepath}: {e}")

        # Build metadata
        metadata = self.config.get_metadata(filepath.relative_to(self.root_dir))

        yield ProcessedDocument(
            source_file=str(filepath),
            document_type="markdown",
            chunks=chunks,
            metadata=metadata,
            errors=errors,
        )

    def _process_json(
        self,
        filepath: Path,
        pattern_config: FilePatternConfig | None,
    ) -> Iterator[ProcessedDocument]:
        """Process a JSON or JSONL file.

        Automatically uses streaming for large files or JSONL format.

        Args:
            filepath: Path to JSON file
            pattern_config: Optional pattern-specific configuration

        Yields:
            ProcessedDocument for the file
        """
        errors: list[str] = []
        chunks: list[dict[str, Any]] = []

        try:
            # Build JSON chunker config
            chunking_config = self.config.get_chunking_config(
                filepath.relative_to(self.root_dir)
            )

            json_config = JSONChunkConfig(
                max_chunk_size=chunking_config.get("max_chunk_size", 1000),
                nested_separator=chunking_config.get("nested_separator", "."),
                array_handling=chunking_config.get("array_handling", "expand"),
                include_field_names=chunking_config.get("include_field_names", True),
                skip_technical_fields=chunking_config.get("skip_technical_fields", True),
            )

            # Apply pattern-specific overrides
            if pattern_config:
                if pattern_config.text_template:
                    json_config.text_template = pattern_config.text_template
                if pattern_config.text_fields:
                    json_config.text_fields = pattern_config.text_fields

            chunker = JSONChunker(json_config)

            # Determine if we should stream
            is_jsonl = self._is_jsonl_file(str(filepath))
            file_size = os.path.getsize(filepath)
            should_stream = is_jsonl or file_size > STREAMING_THRESHOLD_BYTES

            if should_stream:
                # Stream chunks for large files or JSONL
                for json_chunk in chunker.stream_chunks(filepath):
                    chunk_dict = self._json_chunk_to_dict(json_chunk)
                    chunks.append(chunk_dict)
            else:
                # Load and chunk in memory for small files
                import json as json_lib
                with open(filepath, encoding="utf-8") as f:
                    data = json_lib.load(f)

                for json_chunk in chunker.chunk(data, source=str(filepath)):
                    chunk_dict = self._json_chunk_to_dict(json_chunk)
                    chunks.append(chunk_dict)

        except Exception as e:
            errors.append(f"Failed to process JSON: {e}")
            logger.error(f"Error processing {filepath}: {e}")

        # Build metadata
        metadata = self.config.get_metadata(filepath.relative_to(self.root_dir))

        # Determine document type
        doc_type: Literal["json", "jsonl"] = "jsonl" if self._is_jsonl_file(str(filepath)) else "json"

        yield ProcessedDocument(
            source_file=str(filepath),
            document_type=doc_type,
            chunks=chunks,
            metadata=metadata,
            errors=errors,
        )

    def _json_chunk_to_dict(self, chunk: JSONChunk) -> dict[str, Any]:
        """Convert a JSONChunk to a dictionary.

        Args:
            chunk: JSONChunk instance

        Returns:
            Dictionary representation
        """
        return {
            "text": chunk.text,
            "embedding_text": chunk.embedding_text or chunk.text,
            "chunk_index": chunk.chunk_index,
            "source_path": chunk.source_path,
            "metadata": chunk.metadata,
        }

    def _is_jsonl_file(self, filepath: str) -> bool:
        """Check if a file is JSONL format based on extension.

        Args:
            filepath: Path to check

        Returns:
            True if file is JSONL format
        """
        filepath_lower = filepath.lower()
        return any(
            filepath_lower.endswith(ext)
            for ext in [".jsonl", ".ndjson", ".jsonl.gz", ".ndjson.gz"]
        )


def process_directory(
    directory: str | Path,
    config: KnowledgeBaseConfig | None = None,
) -> Iterator[ProcessedDocument]:
    """Convenience function to process a directory.

    Args:
        directory: Directory to process
        config: Optional configuration (loads from directory if not provided)

    Yields:
        ProcessedDocument for each file
    """
    directory = Path(directory)

    if config is None:
        config = KnowledgeBaseConfig.load(directory)

    processor = DirectoryProcessor(config, directory)
    yield from processor.process()

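For orientation, here is a minimal sketch of how the `process_directory` entry point above appears intended to be used, based only on the signatures and docstrings in this hunk. The `docs/` path is hypothetical, and the behavior of `KnowledgeBaseConfig.load` depends on `ingestion/config.py`, which is not shown here.

```python
# Sketch (not from the package): drive process_directory() and inspect results.
from dataknobs_xization.ingestion.processor import process_directory

for doc in process_directory("docs/"):  # hypothetical directory
    if doc.has_errors:
        print(f"{doc.source_file}: {doc.errors}")
        continue
    print(f"{doc.source_file} ({doc.document_type}): {doc.chunk_count} chunks")
    for chunk in doc.chunks:
        # Each chunk dict carries "text", "embedding_text", "chunk_index",
        # "source_path", and "metadata", as assembled in the processor above.
        _ = chunk["embedding_text"]
```
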
@@ -0,0 +1,17 @@ dataknobs_xization/json/__init__.py

"""JSON chunking utilities for RAG applications.

This module provides utilities for chunking JSON data (objects, arrays, JSONL files)
into units suitable for RAG (Retrieval-Augmented Generation) applications.
"""

from dataknobs_xization.json.json_chunker import (
    JSONChunk,
    JSONChunkConfig,
    JSONChunker,
)

__all__ = [
    "JSONChunk",
    "JSONChunkConfig",
    "JSONChunker",
]
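
As a rough companion to the re-exports above, the sketch below uses `JSONChunker` directly on a small in-memory record, mirroring the in-memory branch of `DirectoryProcessor._process_json`. The sample record and source name are invented, and the keyword values copy what `processor.py` passes rather than `JSONChunkConfig`'s own defaults.

```python
# Sketch (not from the package): chunk one JSON record the way the processor
# does for small, non-JSONL files.
from dataknobs_xization.json import JSONChunkConfig, JSONChunker

config = JSONChunkConfig(
    max_chunk_size=1000,            # values mirror _process_json's defaults
    nested_separator=".",
    array_handling="expand",
    include_field_names=True,
    skip_technical_fields=True,
)
chunker = JSONChunker(config)

record = {"title": "Example", "body": "Some text worth retrieving.", "tags": ["a", "b"]}
for chunk in chunker.chunk(record, source="example.json"):  # invented record and source
    print(chunk.chunk_index, chunk.source_path)
    print(chunk.embedding_text or chunk.text)
```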