dataknobs_xization-1.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,367 @@
+"""Directory processor for knowledge base ingestion.
+
+This module provides the DirectoryProcessor class for processing
+documents from a directory into chunks ready for embedding.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Iterator, Literal
+
+from dataknobs_xization.ingestion.config import (
+    FilePatternConfig,
+    KnowledgeBaseConfig,
+)
+from dataknobs_xization.json import JSONChunk, JSONChunkConfig, JSONChunker
+from dataknobs_xization.markdown import (
+    ChunkQualityConfig,
+    HeadingInclusion,
+    chunk_markdown_tree,
+    parse_markdown,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ProcessedDocument:
+    """A processed document ready for embedding and storage.
+
+    Contains chunks from a single source file along with metadata
+    about the processing.
+
+    Attributes:
+        source_file: Path to the source file
+        document_type: Type of document (markdown, json, jsonl)
+        chunks: List of processed chunks
+        metadata: Document-level metadata
+        errors: Any errors encountered during processing
+    """
+
+    source_file: str
+    document_type: Literal["markdown", "json", "jsonl"]
+    chunks: list[dict[str, Any]]
+    metadata: dict[str, Any] = field(default_factory=dict)
+    errors: list[str] = field(default_factory=list)
+
+    @property
+    def chunk_count(self) -> int:
+        """Number of chunks in this document."""
+        return len(self.chunks)
+
+    @property
+    def has_errors(self) -> bool:
+        """Whether processing encountered errors."""
+        return len(self.errors) > 0
+
+
+# File size threshold for streaming (10MB)
+STREAMING_THRESHOLD_BYTES = 10 * 1024 * 1024
+
+# Config file names to always exclude from processing
+CONFIG_FILE_NAMES = {"knowledge_base.json", "knowledge_base.yaml", "knowledge_base.yml"}
+
+
+class DirectoryProcessor:
+    """Process documents from a directory for knowledge base ingestion.
+
+    Handles markdown and JSON files with configurable chunking,
+    supporting both in-memory and streaming processing for large files.
+
+    Attributes:
+        config: Knowledge base configuration
+        root_dir: Root directory for processing
+    """
+
+    def __init__(self, config: KnowledgeBaseConfig, root_dir: str | Path):
+        """Initialize the directory processor.
+
+        Args:
+            config: Knowledge base configuration
+            root_dir: Root directory containing documents
+        """
+        self.config = config
+        self.root_dir = Path(root_dir)
+
+    def process(self) -> Iterator[ProcessedDocument]:
+        """Process all documents in the directory.
+
+        Yields ProcessedDocument for each file, automatically using
+        streaming for large JSON files to avoid memory exhaustion.
+
+        Yields:
+            ProcessedDocument for each processed file
+        """
+        # Collect all files first
+        files = self._collect_files()
+
+        for filepath in files:
+            rel_path = filepath.relative_to(self.root_dir)
+
+            # Skip config files
+            if filepath.name in CONFIG_FILE_NAMES:
+                logger.debug(f"Skipping config file: {rel_path}")
+                continue
+
+            # Skip excluded files
+            if self.config.is_excluded(rel_path):
+                logger.debug(f"Skipping excluded file: {rel_path}")
+                continue
+
+            # Get pattern config if any
+            pattern_config = self.config.get_pattern_config(rel_path)
+
+            # Process based on file type
+            suffix = filepath.suffix.lower()
+            if suffix == ".md":
+                yield from self._process_markdown(filepath, pattern_config)
+            elif suffix in (".json", ".jsonl", ".ndjson"):
+                yield from self._process_json(filepath, pattern_config)
+            elif suffix == ".gz":
+                # Check inner extension for compressed files
+                inner_suffix = Path(filepath.stem).suffix.lower()
+                if inner_suffix in (".json", ".jsonl", ".ndjson"):
+                    yield from self._process_json(filepath, pattern_config)
+                else:
+                    logger.debug(f"Skipping unsupported compressed file: {rel_path}")
+            else:
+                logger.debug(f"Skipping unsupported file type: {rel_path}")
+
+    def _collect_files(self) -> list[Path]:
+        """Collect all files to process from the directory.
+
+        Returns:
+            List of file paths
+        """
+        files = []
+
+        # If patterns are defined, use them to find files
+        if self.config.patterns:
+            for pattern_config in self.config.patterns:
+                if pattern_config.enabled:
+                    for filepath in self.root_dir.glob(pattern_config.pattern):
+                        if filepath.is_file() and filepath not in files:
+                            files.append(filepath)
+        else:
+            # Default: find all supported files
+            for pattern in ["**/*.md", "**/*.json", "**/*.jsonl", "**/*.ndjson",
+                            "**/*.json.gz", "**/*.jsonl.gz", "**/*.ndjson.gz"]:
+                for filepath in self.root_dir.glob(pattern):
+                    if filepath.is_file() and filepath not in files:
+                        files.append(filepath)
+
+        return sorted(files)
+
+    def _process_markdown(
+        self,
+        filepath: Path,
+        pattern_config: FilePatternConfig | None,
+    ) -> Iterator[ProcessedDocument]:
+        """Process a markdown file.
+
+        Args:
+            filepath: Path to markdown file
+            pattern_config: Optional pattern-specific configuration
+
+        Yields:
+            ProcessedDocument for the file
+        """
+        errors: list[str] = []
+        chunks: list[dict[str, Any]] = []
+
+        try:
+            # Read file
+            with open(filepath, encoding="utf-8") as f:
+                content = f.read()
+
+            # Get chunking config
+            chunking_config = self.config.get_chunking_config(
+                filepath.relative_to(self.root_dir)
+            )
+
+            # Build quality filter if configured
+            quality_filter = None
+            if self.config.default_quality_filter:
+                quality_filter = ChunkQualityConfig(**self.config.default_quality_filter)
+
+            # Parse and chunk
+            tree = parse_markdown(content)
+            md_chunks = chunk_markdown_tree(
+                tree,
+                max_chunk_size=chunking_config.get("max_chunk_size", 500),
+                chunk_overlap=chunking_config.get("chunk_overlap", 50),
+                heading_inclusion=HeadingInclusion.IN_METADATA,
+                combine_under_heading=chunking_config.get("combine_under_heading", True),
+                quality_filter=quality_filter,
+                generate_embeddings=True,
+            )
+
+            # Convert to dictionaries
+            for i, chunk in enumerate(md_chunks):
+                chunk_dict = {
+                    "text": chunk.text,
+                    "embedding_text": chunk.metadata.embedding_text or chunk.text,
+                    "chunk_index": i,
+                    "source_path": "",
+                    "metadata": {
+                        "heading_path": chunk.metadata.heading_display or chunk.metadata.get_heading_path(),
+                        "headings": chunk.metadata.headings,
+                        "heading_levels": chunk.metadata.heading_levels,
+                        "line_number": chunk.metadata.line_number,
+                        "chunk_size": chunk.metadata.chunk_size,
+                    },
+                }
+                chunks.append(chunk_dict)
+
+        except Exception as e:
+            errors.append(f"Failed to process markdown: {e}")
+            logger.error(f"Error processing {filepath}: {e}")
+
+        # Build metadata
+        metadata = self.config.get_metadata(filepath.relative_to(self.root_dir))
+
+        yield ProcessedDocument(
+            source_file=str(filepath),
+            document_type="markdown",
+            chunks=chunks,
+            metadata=metadata,
+            errors=errors,
+        )
+
+    def _process_json(
+        self,
+        filepath: Path,
+        pattern_config: FilePatternConfig | None,
+    ) -> Iterator[ProcessedDocument]:
+        """Process a JSON or JSONL file.
+
+        Automatically uses streaming for large files or JSONL format.
+
+        Args:
+            filepath: Path to JSON file
+            pattern_config: Optional pattern-specific configuration
+
+        Yields:
+            ProcessedDocument for the file
+        """
+        errors: list[str] = []
+        chunks: list[dict[str, Any]] = []
+
+        try:
+            # Build JSON chunker config
+            chunking_config = self.config.get_chunking_config(
+                filepath.relative_to(self.root_dir)
+            )
+
+            json_config = JSONChunkConfig(
+                max_chunk_size=chunking_config.get("max_chunk_size", 1000),
+                nested_separator=chunking_config.get("nested_separator", "."),
+                array_handling=chunking_config.get("array_handling", "expand"),
+                include_field_names=chunking_config.get("include_field_names", True),
+                skip_technical_fields=chunking_config.get("skip_technical_fields", True),
+            )
+
+            # Apply pattern-specific overrides
+            if pattern_config:
+                if pattern_config.text_template:
+                    json_config.text_template = pattern_config.text_template
+                if pattern_config.text_fields:
+                    json_config.text_fields = pattern_config.text_fields
+
+            chunker = JSONChunker(json_config)
+
+            # Determine if we should stream
+            is_jsonl = self._is_jsonl_file(str(filepath))
+            file_size = os.path.getsize(filepath)
+            should_stream = is_jsonl or file_size > STREAMING_THRESHOLD_BYTES
+
+            if should_stream:
+                # Stream chunks for large files or JSONL
+                for json_chunk in chunker.stream_chunks(filepath):
+                    chunk_dict = self._json_chunk_to_dict(json_chunk)
+                    chunks.append(chunk_dict)
+            else:
+                # Load and chunk in memory for small files
+                import json as json_lib
+                with open(filepath, encoding="utf-8") as f:
+                    data = json_lib.load(f)
+
+                for json_chunk in chunker.chunk(data, source=str(filepath)):
+                    chunk_dict = self._json_chunk_to_dict(json_chunk)
+                    chunks.append(chunk_dict)
+
+        except Exception as e:
+            errors.append(f"Failed to process JSON: {e}")
+            logger.error(f"Error processing {filepath}: {e}")
+
+        # Build metadata
+        metadata = self.config.get_metadata(filepath.relative_to(self.root_dir))
+
+        # Determine document type
+        doc_type: Literal["json", "jsonl"] = "jsonl" if self._is_jsonl_file(str(filepath)) else "json"
+
+        yield ProcessedDocument(
+            source_file=str(filepath),
+            document_type=doc_type,
+            chunks=chunks,
+            metadata=metadata,
+            errors=errors,
+        )
+
+    def _json_chunk_to_dict(self, chunk: JSONChunk) -> dict[str, Any]:
+        """Convert a JSONChunk to a dictionary.
+
+        Args:
+            chunk: JSONChunk instance
+
+        Returns:
+            Dictionary representation
+        """
+        return {
+            "text": chunk.text,
+            "embedding_text": chunk.embedding_text or chunk.text,
+            "chunk_index": chunk.chunk_index,
+            "source_path": chunk.source_path,
+            "metadata": chunk.metadata,
+        }
+
+    def _is_jsonl_file(self, filepath: str) -> bool:
+        """Check if a file is JSONL format based on extension.
+
+        Args:
+            filepath: Path to check
+
+        Returns:
+            True if file is JSONL format
+        """
+        filepath_lower = filepath.lower()
+        return any(
+            filepath_lower.endswith(ext)
+            for ext in [".jsonl", ".ndjson", ".jsonl.gz", ".ndjson.gz"]
+        )
+
+
+def process_directory(
+    directory: str | Path,
+    config: KnowledgeBaseConfig | None = None,
+) -> Iterator[ProcessedDocument]:
+    """Convenience function to process a directory.
+
+    Args:
+        directory: Directory to process
+        config: Optional configuration (loads from directory if not provided)
+
+    Yields:
+        ProcessedDocument for each file
+    """
+    directory = Path(directory)
+
+    if config is None:
+        config = KnowledgeBaseConfig.load(directory)
+
+    processor = DirectoryProcessor(config, directory)
+    yield from processor.process()
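
For orientation, here is a minimal usage sketch of the module added above. The import path (dataknobs_xization.ingestion.directory_processor) and the ./docs directory are assumptions not confirmed by this diff; the rest mirrors the API visible in the hunk, with configuration loaded from the directory when no config is passed.

    from dataknobs_xization.ingestion.directory_processor import process_directory  # import path assumed

    # Walk every supported file under ./docs (hypothetical path) and report
    # per-document errors and chunk counts.
    for doc in process_directory("./docs"):
        if doc.has_errors:
            print(f"{doc.source_file}: {doc.errors}")
            continue
        print(f"{doc.source_file} ({doc.document_type}): {doc.chunk_count} chunks")
        for chunk in doc.chunks:
            embedding_input = chunk["embedding_text"]  # text intended for the embedding model
            # ... hand embedding_input to an embedder / vector store here
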
@@ -0,0 +1,17 @@
+"""JSON chunking utilities for RAG applications.
+
+This module provides utilities for chunking JSON data (objects, arrays, JSONL files)
+into units suitable for RAG (Retrieval-Augmented Generation) applications.
+"""
+
+from dataknobs_xization.json.json_chunker import (
+    JSONChunk,
+    JSONChunkConfig,
+    JSONChunker,
+)
+
+__all__ = [
+    "JSONChunk",
+    "JSONChunkConfig",
+    "JSONChunker",
+]
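
For reference, a small sketch of how these exports appear to be used, based solely on the calls visible in the directory processor hunk above (JSONChunkConfig fields, JSONChunker.chunk, and the chunk attributes); the sample record and file name are made up.

    from dataknobs_xization.json import JSONChunkConfig, JSONChunker

    # Same defaults that DirectoryProcessor._process_json falls back to
    config = JSONChunkConfig(
        max_chunk_size=1000,
        nested_separator=".",
        array_handling="expand",
        include_field_names=True,
        skip_technical_fields=True,
    )
    chunker = JSONChunker(config)

    record = {"title": "Example", "body": "A short made-up document."}  # hypothetical data
    for chunk in chunker.chunk(record, source="example.json"):
        print(chunk.chunk_index, chunk.source_path)
        print(chunk.embedding_text or chunk.text)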