dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,352 @@
1
+ """Configuration schema for knowledge base ingestion.
2
+
3
+ This module provides configuration classes for loading and processing
4
+ documents from a directory into a knowledge base.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class IngestionConfigError(Exception):
    """Signal a problem with a knowledge base ingestion configuration.

    Used for configuration-related failures such as a config file that
    cannot be loaded or whose content is not a dictionary.
    """
22
+
23
+
24
@dataclass
class FilePatternConfig:
    """Per-pattern processing overrides for knowledge base files.

    Files whose path matches ``pattern`` can be given their own chunking
    and metadata settings instead of the knowledge base defaults.

    Attributes:
        pattern: Glob pattern selecting the files (e.g., "api/**/*.json")
        enabled: False to switch this pattern entry off
        chunking: Chunking settings that override the defaults for matches
        text_template: Jinja2 template for JSON text generation
        text_fields: Fields to use for text generation (JSON)
        metadata_fields: Fields to include in chunk metadata
    """

    pattern: str
    enabled: bool = True
    chunking: dict[str, Any] | None = None
    text_template: str | None = None
    text_fields: list[str] | None = None
    metadata_fields: list[str] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary, leaving out unset options.

        ``pattern`` is always present; ``enabled`` appears only when it is
        False, and the optional settings appear only when they are truthy.
        """
        serialized: dict[str, Any] = {"pattern": self.pattern}
        if not self.enabled:
            serialized["enabled"] = False
        # Emit the optional settings in declaration order.
        for key in ("chunking", "text_template", "text_fields", "metadata_fields"):
            value = getattr(self, key)
            if value:
                serialized[key] = value
        return serialized

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> FilePatternConfig:
        """Build an instance from a dictionary.

        Args:
            data: Mapping with a required "pattern" key and optional
                "enabled", "chunking", "text_template", "text_fields"
                and "metadata_fields" keys

        Returns:
            The corresponding FilePatternConfig

        Raises:
            KeyError: If "pattern" is missing from ``data``
        """
        # Look up the required key first so a missing pattern fails early.
        glob_pattern = data["pattern"]
        is_enabled = data.get("enabled", True)
        overrides = {
            key: data.get(key)
            for key in ("chunking", "text_template", "text_fields", "metadata_fields")
        }
        return cls(pattern=glob_pattern, enabled=is_enabled, **overrides)
73
+
74
+
75
@dataclass
class KnowledgeBaseConfig:
    r"""Configuration for knowledge base ingestion from a directory.

    Defines how documents in a directory should be processed, chunked,
    and prepared for embedding. Supports glob-pattern based overrides
    for different file types.

    Attributes:
        name: Name of the knowledge base
        default_chunking: Default chunking settings for all files
        default_quality_filter: Default quality filter settings
        patterns: List of file pattern configurations with overrides
        exclude_patterns: Glob patterns for files to skip
        default_metadata: Metadata to attach to all chunks

    Example:
        ```yaml
        name: product-docs
        default_chunking:
          max_chunk_size: 500
          chunk_overlap: 50

        patterns:
          - pattern: "api/**/*.json"
            text_template: "API: {{ method }} {{ path }}\\n{{ description }}"
            metadata_fields: [method, path, auth_required]

          - pattern: "guides/**/*.md"
            chunking:
              max_chunk_size: 800

        exclude_patterns:
          - "**/drafts/**"
          - "**/.git/**"
        ```
    """

    name: str
    default_chunking: dict[str, Any] = field(default_factory=lambda: {
        "max_chunk_size": 500,
        "chunk_overlap": 50,
    })
    default_quality_filter: dict[str, Any] | None = None
    patterns: list[FilePatternConfig] = field(default_factory=list)
    exclude_patterns: list[str] = field(default_factory=list)
    default_metadata: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def load(cls, directory: str | Path) -> KnowledgeBaseConfig:
        """Load configuration from a directory.

        Looks for `knowledge_base.yaml`, `knowledge_base.yml`, or
        `knowledge_base.json` in the directory (in that order). When none
        exists, a default configuration named after the directory is
        returned.

        Args:
            directory: Directory containing the config file

        Returns:
            Loaded KnowledgeBaseConfig instance

        Raises:
            IngestionConfigError: If the config file cannot be read or parsed
        """
        directory = Path(directory)
        config_path = cls._find_config_file(directory)

        if config_path is None:
            # No config file: fall back to defaults named after the directory.
            logger.debug(
                "No knowledge_base config found in %s, using defaults", directory
            )
            return cls(name=directory.name)

        try:
            data = cls._load_file(config_path)
        except Exception as e:
            # Boundary: surface every read/parse failure as a config error.
            raise IngestionConfigError(
                f"Failed to load config from {config_path}: {e}"
            ) from e

        return cls.from_dict(data, default_name=directory.name)

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        default_name: str = "knowledge_base",
    ) -> KnowledgeBaseConfig:
        """Create from dictionary representation.

        Keys that are missing, or present with a ``None`` value (as YAML
        produces for a key written without a value, e.g. a bare
        ``patterns:``), fall back to the defaults.

        Args:
            data: Configuration dictionary
            default_name: Default name if not specified in data

        Returns:
            KnowledgeBaseConfig instance
        """

        def value_or(key: str, fallback: Any) -> Any:
            # Treat an explicit null like a missing key; keep empty
            # containers that were set on purpose.
            value = data.get(key)
            return fallback if value is None else value

        patterns = [
            FilePatternConfig.from_dict(p) if isinstance(p, dict) else p
            for p in value_or("patterns", [])
        ]

        return cls(
            name=value_or("name", default_name),
            default_chunking=value_or("default_chunking", {
                "max_chunk_size": 500,
                "chunk_overlap": 50,
            }),
            default_quality_filter=data.get("default_quality_filter"),
            patterns=patterns,
            exclude_patterns=value_or("exclude_patterns", []),
            default_metadata=value_or("default_metadata", {}),
        )

    @classmethod
    def _find_config_file(cls, directory: Path) -> Path | None:
        """Find the config file in a directory.

        Args:
            directory: Directory to search

        Returns:
            Path to the first existing candidate, or None if not found
        """
        for name in ("knowledge_base.yaml", "knowledge_base.yml", "knowledge_base.json"):
            path = directory / name
            if path.exists():
                return path
        return None

    @classmethod
    def _load_file(cls, path: Path) -> dict[str, Any]:
        """Load and parse a config file.

        Files ending in `.yaml`/`.yml` are parsed with PyYAML's
        ``safe_load``; anything else is parsed as JSON.

        Args:
            path: Path to config file

        Returns:
            Parsed configuration dictionary

        Raises:
            IngestionConfigError: If PyYAML is needed but not installed, or
                the parsed content is not a dictionary
        """
        is_yaml = path.suffix in (".yaml", ".yml")
        if is_yaml:
            # Only the import is guarded, so parse errors are not mistaken
            # for a missing dependency.
            try:
                import yaml
            except ImportError as err:
                raise IngestionConfigError(
                    "PyYAML is required to load YAML config files. "
                    "Install with: pip install pyyaml"
                ) from err

        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f) if is_yaml else json.load(f)

        if not isinstance(data, dict):
            raise IngestionConfigError(
                f"Config file must contain a dictionary: {path}"
            )

        return data

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        ``name`` is always included; every other setting is included only
        when it is truthy.
        """
        result: dict[str, Any] = {"name": self.name}

        if self.default_chunking:
            result["default_chunking"] = self.default_chunking

        if self.default_quality_filter:
            result["default_quality_filter"] = self.default_quality_filter

        if self.patterns:
            result["patterns"] = [p.to_dict() for p in self.patterns]

        if self.exclude_patterns:
            result["exclude_patterns"] = self.exclude_patterns

        if self.default_metadata:
            result["default_metadata"] = self.default_metadata

        return result

    def get_pattern_config(self, filepath: str | Path) -> FilePatternConfig | None:
        """Get the pattern config that matches a file path.

        Returns the first enabled matching pattern config, or None if no
        pattern matches. Patterns are checked in order, so more specific
        patterns should come first.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            Matching FilePatternConfig, or None
        """
        filepath = Path(filepath)

        for pattern_config in self.patterns:
            if pattern_config.enabled and self._matches_pattern(filepath, pattern_config.pattern):
                return pattern_config

        return None

    def is_excluded(self, filepath: str | Path) -> bool:
        """Check if a file path matches any exclude pattern.

        Args:
            filepath: Path to check (relative to knowledge base root)

        Returns:
            True if file should be excluded
        """
        filepath = Path(filepath)
        return any(
            self._matches_pattern(filepath, pattern)
            for pattern in self.exclude_patterns
        )

    @staticmethod
    def _match_segments(
        path_parts: tuple[str, ...], pattern_parts: tuple[str, ...]
    ) -> bool:
        """Match path components against pattern segments, start to end.

        A segment that is exactly `**` stands for zero or more whole path
        components; every other segment is compared to a single component
        with ``fnmatch``.

        Args:
            path_parts: Components of the path being tested
            pattern_parts: Components of the glob pattern

        Returns:
            True if the whole path is matched by the whole pattern
        """
        from fnmatch import fnmatch

        # reachable[i] is True when the pattern segments consumed so far can
        # match exactly the first i path components.
        reachable = [True] + [False] * len(path_parts)
        for segment in pattern_parts:
            if segment == "**":
                # Zero or more components: every position at or after a
                # reachable one becomes reachable.
                advanced = []
                seen = False
                for flag in reachable:
                    seen = seen or flag
                    advanced.append(seen)
            else:
                # Exactly one component must match this segment.
                advanced = [False]
                advanced.extend(
                    reachable[i] and fnmatch(part, segment)
                    for i, part in enumerate(path_parts)
                )
            reachable = advanced
        return reachable[-1]

    def _matches_pattern(self, filepath: Path, pattern: str) -> bool:
        """Check if a filepath matches a glob pattern.

        Patterns without `**` are matched with ``fnmatch`` against the
        whole path string. Patterns containing `**` are matched component
        by component, with `**` spanning zero or more directories.

        Args:
            filepath: Path to check
            pattern: Glob pattern

        Returns:
            True if path matches pattern
        """
        from fnmatch import fnmatch

        if "**" not in pattern:
            # Use fnmatch for simple patterns
            return fnmatch(str(filepath), pattern)

        # PurePath.match treats `**` like a single `*`, so it cannot match
        # zero or several directory levels; do the recursive match here.
        if self._match_segments(filepath.parts, Path(pattern).parts):
            return True

        # Keep the previous Path.match result as a fallback so paths that
        # matched before this fix still match.
        return filepath.match(pattern)

    def get_chunking_config(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective chunking config for a file.

        Merges default chunking with any pattern-specific overrides; the
        stored defaults are not modified.

        Args:
            filepath: Path to file

        Returns:
            Merged chunking configuration
        """
        config = self.default_chunking.copy()

        pattern_config = self.get_pattern_config(filepath)
        if pattern_config and pattern_config.chunking:
            config.update(pattern_config.chunking)

        return config

    def get_metadata(self, filepath: str | Path) -> dict[str, Any]:
        """Get the effective metadata for a file.

        Copies the default metadata and adds "source" (the path as a
        string) and "filename" (the final path component).

        Args:
            filepath: Path to file

        Returns:
            Metadata dictionary
        """
        filepath = Path(filepath)
        metadata = self.default_metadata.copy()
        metadata["source"] = str(filepath)
        metadata["filename"] = filepath.name
        return metadata