dataknobs-xization 1.2.3 (dataknobs_xization-1.2.3-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_xization/0.readme.txt +66 -0
- dataknobs_xization/__init__.py +110 -0
- dataknobs_xization/annotations.py +1476 -0
- dataknobs_xization/authorities.py +860 -0
- dataknobs_xization/content_transformer.py +570 -0
- dataknobs_xization/ingestion/__init__.py +27 -0
- dataknobs_xization/ingestion/config.py +352 -0
- dataknobs_xization/ingestion/processor.py +367 -0
- dataknobs_xization/json/__init__.py +17 -0
- dataknobs_xization/json/json_chunker.py +591 -0
- dataknobs_xization/lexicon.py +723 -0
- dataknobs_xization/markdown/__init__.py +72 -0
- dataknobs_xization/markdown/enrichment.py +260 -0
- dataknobs_xization/markdown/filters.py +236 -0
- dataknobs_xization/markdown/md_chunker.py +478 -0
- dataknobs_xization/markdown/md_parser.py +605 -0
- dataknobs_xization/markdown/md_streaming.py +302 -0
- dataknobs_xization/masking_tokenizer.py +768 -0
- dataknobs_xization/normalize.py +520 -0
- dataknobs_xization/py.typed +0 -0
- dataknobs_xization-1.2.3.dist-info/METADATA +170 -0
- dataknobs_xization-1.2.3.dist-info/RECORD +23 -0
- dataknobs_xization-1.2.3.dist-info/WHEEL +4 -0
@@ -0,0 +1,352 @@
+"""Configuration schema for knowledge base ingestion.
+
+This module provides configuration classes for loading and processing
+documents from a directory into a knowledge base.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+class IngestionConfigError(Exception):
+    """Error related to ingestion configuration."""
+
+    pass
+
+
+@dataclass
+class FilePatternConfig:
+    """Configuration for a specific file pattern.
+
+    Allows overriding chunking and metadata settings for files
+    matching a glob pattern.
+
+    Attributes:
+        pattern: Glob pattern to match files (e.g., "api/**/*.json")
+        enabled: Whether to process files matching this pattern
+        chunking: Override chunking settings for matched files
+        text_template: Jinja2 template for JSON text generation
+        text_fields: Fields to use for text generation (JSON)
+        metadata_fields: Fields to include in chunk metadata
+    """
+
+    pattern: str
+    enabled: bool = True
+    chunking: dict[str, Any] | None = None
+    text_template: str | None = None
+    text_fields: list[str] | None = None
+    metadata_fields: list[str] | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary representation."""
+        result: dict[str, Any] = {"pattern": self.pattern}
+        if not self.enabled:
+            result["enabled"] = False
+        if self.chunking:
+            result["chunking"] = self.chunking
+        if self.text_template:
+            result["text_template"] = self.text_template
+        if self.text_fields:
+            result["text_fields"] = self.text_fields
+        if self.metadata_fields:
+            result["metadata_fields"] = self.metadata_fields
+        return result
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> FilePatternConfig:
+        """Create from dictionary representation."""
+        return cls(
+            pattern=data["pattern"],
+            enabled=data.get("enabled", True),
+            chunking=data.get("chunking"),
+            text_template=data.get("text_template"),
+            text_fields=data.get("text_fields"),
+            metadata_fields=data.get("metadata_fields"),
+        )
+
+
+@dataclass
+class KnowledgeBaseConfig:
+    r"""Configuration for knowledge base ingestion from a directory.
+
+    Defines how documents in a directory should be processed, chunked,
+    and prepared for embedding. Supports glob-pattern based overrides
+    for different file types.
+
+    Attributes:
+        name: Name of the knowledge base
+        default_chunking: Default chunking settings for all files
+        default_quality_filter: Default quality filter settings
+        patterns: List of file pattern configurations with overrides
+        exclude_patterns: Glob patterns for files to skip
+        default_metadata: Metadata to attach to all chunks
+
+    Example:
+        ```yaml
+        name: product-docs
+        default_chunking:
+          max_chunk_size: 500
+          chunk_overlap: 50
+
+        patterns:
+          - pattern: "api/**/*.json"
+            text_template: "API: {{ method }} {{ path }}\\n{{ description }}"
+            metadata_fields: [method, path, auth_required]
+
+          - pattern: "guides/**/*.md"
+            chunking:
+              max_chunk_size: 800
+
+        exclude_patterns:
+          - "**/drafts/**"
+          - "**/.git/**"
+        ```
+    """
+
+    name: str
+    default_chunking: dict[str, Any] = field(default_factory=lambda: {
+        "max_chunk_size": 500,
+        "chunk_overlap": 50,
+    })
+    default_quality_filter: dict[str, Any] | None = None
+    patterns: list[FilePatternConfig] = field(default_factory=list)
+    exclude_patterns: list[str] = field(default_factory=list)
+    default_metadata: dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def load(cls, directory: str | Path) -> KnowledgeBaseConfig:
+        """Load configuration from a directory.
+
+        Looks for `knowledge_base.yaml`, `knowledge_base.yml`, or
+        `knowledge_base.json` in the directory.
+
+        Args:
+            directory: Directory containing the config file
+
+        Returns:
+            Loaded KnowledgeBaseConfig instance
+
+        Raises:
+            IngestionConfigError: If config file is invalid or missing
+        """
+        directory = Path(directory)
+        config_path = cls._find_config_file(directory)
+
+        if config_path is None:
+            # Return default config with directory name
+            logger.debug(
+                f"No knowledge_base config found in {directory}, using defaults"
+            )
+            return cls(name=directory.name)
+
+        try:
+            data = cls._load_file(config_path)
+        except Exception as e:
+            raise IngestionConfigError(
+                f"Failed to load config from {config_path}: {e}"
+            ) from e
+
+        return cls.from_dict(data, default_name=directory.name)
+
+    @classmethod
+    def from_dict(
+        cls,
+        data: dict[str, Any],
+        default_name: str = "knowledge_base",
+    ) -> KnowledgeBaseConfig:
+        """Create from dictionary representation.
+
+        Args:
+            data: Configuration dictionary
+            default_name: Default name if not specified in data
+
+        Returns:
+            KnowledgeBaseConfig instance
+        """
+        patterns = [
+            FilePatternConfig.from_dict(p) if isinstance(p, dict) else p
+            for p in data.get("patterns", [])
+        ]
+
+        return cls(
+            name=data.get("name", default_name),
+            default_chunking=data.get("default_chunking", {
+                "max_chunk_size": 500,
+                "chunk_overlap": 50,
+            }),
+            default_quality_filter=data.get("default_quality_filter"),
+            patterns=patterns,
+            exclude_patterns=data.get("exclude_patterns", []),
+            default_metadata=data.get("default_metadata", {}),
+        )
+
+    @classmethod
+    def _find_config_file(cls, directory: Path) -> Path | None:
+        """Find the config file in a directory.
+
+        Args:
+            directory: Directory to search
+
+        Returns:
+            Path to config file, or None if not found
+        """
+        for name in ["knowledge_base.yaml", "knowledge_base.yml", "knowledge_base.json"]:
+            path = directory / name
+            if path.exists():
+                return path
+        return None
+
+    @classmethod
+    def _load_file(cls, path: Path) -> dict[str, Any]:
+        """Load and parse a config file.
+
+        Args:
+            path: Path to config file
+
+        Returns:
+            Parsed configuration dictionary
+        """
+        with open(path, encoding="utf-8") as f:
+            if path.suffix in [".yaml", ".yml"]:
+                try:
+                    import yaml
+                    data = yaml.safe_load(f)
+                except ImportError as err:
+                    raise IngestionConfigError(
+                        "PyYAML is required to load YAML config files. "
+                        "Install with: pip install pyyaml"
+                    ) from err
+            else:
+                data = json.load(f)
+
+        if not isinstance(data, dict):
+            raise IngestionConfigError(
+                f"Config file must contain a dictionary: {path}"
+            )
+
+        return data
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary representation."""
+        result: dict[str, Any] = {"name": self.name}
+
+        if self.default_chunking:
+            result["default_chunking"] = self.default_chunking
+
+        if self.default_quality_filter:
+            result["default_quality_filter"] = self.default_quality_filter
+
+        if self.patterns:
+            result["patterns"] = [p.to_dict() for p in self.patterns]
+
+        if self.exclude_patterns:
+            result["exclude_patterns"] = self.exclude_patterns
+
+        if self.default_metadata:
+            result["default_metadata"] = self.default_metadata
+
+        return result
+
+    def get_pattern_config(self, filepath: str | Path) -> FilePatternConfig | None:
+        """Get the pattern config that matches a file path.
+
+        Returns the first matching pattern config, or None if no pattern matches.
+        Patterns are checked in order, so more specific patterns should come first.
+
+        Args:
+            filepath: Path to check (relative to knowledge base root)
+
+        Returns:
+            Matching FilePatternConfig, or None
+        """
+        filepath = Path(filepath)
+
+        for pattern_config in self.patterns:
+            if pattern_config.enabled and self._matches_pattern(filepath, pattern_config.pattern):
+                return pattern_config
+
+        return None
+
+    def is_excluded(self, filepath: str | Path) -> bool:
+        """Check if a file path matches any exclude pattern.
+
+        Args:
+            filepath: Path to check (relative to knowledge base root)
+
+        Returns:
+            True if file should be excluded
+        """
+        filepath = Path(filepath)
+
+        for pattern in self.exclude_patterns:
+            if self._matches_pattern(filepath, pattern):
+                return True
+
+        return False
+
+    def _matches_pattern(self, filepath: Path, pattern: str) -> bool:
+        """Check if a filepath matches a glob pattern.
+
+        Handles both fnmatch-style and glob-style patterns including `**`.
+
+        Args:
+            filepath: Path to check
+            pattern: Glob pattern
+
+        Returns:
+            True if path matches pattern
+        """
+        from fnmatch import fnmatch
+
+        filepath_str = str(filepath)
+
+        # Handle ** patterns by using Path.match for recursive matching
+        if "**" in pattern:
+            # Path.match handles ** as recursive glob
+            return filepath.match(pattern)
+        else:
+            # Use fnmatch for simple patterns
+            return fnmatch(filepath_str, pattern)
+
+    def get_chunking_config(self, filepath: str | Path) -> dict[str, Any]:
+        """Get the effective chunking config for a file.
+
+        Merges default chunking with any pattern-specific overrides.
+
+        Args:
+            filepath: Path to file
+
+        Returns:
+            Merged chunking configuration
+        """
+        config = self.default_chunking.copy()
+
+        pattern_config = self.get_pattern_config(filepath)
+        if pattern_config and pattern_config.chunking:
+            config.update(pattern_config.chunking)
+
+        return config
+
+    def get_metadata(self, filepath: str | Path) -> dict[str, Any]:
+        """Get the effective metadata for a file.
+
+        Includes default metadata plus source file info.
+
+        Args:
+            filepath: Path to file
+
+        Returns:
+            Metadata dictionary
+        """
+        filepath = Path(filepath)
+        metadata = self.default_metadata.copy()
+        metadata["source"] = str(filepath)
+        metadata["filename"] = filepath.name
+        return metadata