dataknobs-xization 1.2.3 (py3-none-any.whl)

@@ -0,0 +1,591 @@
+ """JSON chunker for generating RAG-optimized chunks from JSON data.
+
+ This module provides functionality to chunk JSON data (objects, arrays, JSONL files)
+ into units suitable for RAG (Retrieval-Augmented Generation) applications, with
+ preserved metadata and configurable text generation.
+
+ Supports both in-memory and streaming modes for handling large files.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Iterator, Literal
+
+ # Patterns for detecting technical/non-text fields
+ UUID_PATTERN = re.compile(
+     r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.IGNORECASE
+ )
+ BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/]{20,}={0,2}$")
+ TIMESTAMP_PATTERN = re.compile(
+     r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}"  # ISO format
+ )
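+ # For example (illustrative): UUID_PATTERN matches
+ # "123e4567-e89b-12d3-a456-426614174000", BASE64_PATTERN matches runs of 20+
+ # base64 characters such as "QUJDREVGR0hJSktMTU5PUA==", and TIMESTAMP_PATTERN
+ # matches ISO-style prefixes such as "2024-01-15T10:30:00".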
+
+ # Field names commonly containing text content
+ TEXT_FIELD_NAMES = frozenset({
+     "title", "name", "description", "content", "text", "summary",
+     "body", "message", "comment", "note", "notes", "abstract",
+     "overview", "details", "explanation", "definition", "label",
+ })
+
+ # Field names to skip (technical/metadata)
+ SKIP_FIELD_NAMES = frozenset({
+     "id", "uuid", "guid", "_id", "created_at", "updated_at",
+     "created", "updated", "timestamp", "modified", "hash",
+     "checksum", "signature", "token", "key", "secret",
+ })
+
+
+ @dataclass
+ class JSONChunkConfig:
+     """Configuration for JSON chunking.
+
+     Attributes:
+         max_chunk_size: Maximum size of generated text in characters
+         text_template: Optional Jinja2 template for text generation (overrides auto-generated text)
+         text_fields: Specific fields to use for text (None = auto-detect)
+         nested_separator: Separator for flattened nested keys (e.g., "config.database.host")
+         array_handling: How to handle arrays of primitives - "expand" includes all
+             values, "join" includes the first ten with a total count, "first" uses
+             only the first value
+         include_field_names: Whether to include field names in generated text
+         skip_technical_fields: Whether to skip UUIDs, timestamps, and base64 in text generation
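+
+     Example (illustrative):
+         >>> config = JSONChunkConfig(max_chunk_size=500, array_handling="join")
+         >>> config.nested_separator
+         '.'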
+     """
+
+     max_chunk_size: int = 1000
+     text_template: str | None = None
+     text_fields: list[str] | None = None
+     nested_separator: str = "."
+     array_handling: Literal["expand", "join", "first"] = "expand"
+     include_field_names: bool = True
+     skip_technical_fields: bool = True
+
+
+ @dataclass
+ class JSONChunk:
+     """A chunk generated from JSON data.
+
+     Attributes:
+         text: Generated embeddable text
+         metadata: All original fields (flattened for nested objects)
+         source_path: JSON path to this chunk's source (e.g., "[0].products[2]")
+         source_file: Original file path (if from file)
+         embedding_text: Enriched text optimized for embedding
+         chunk_index: Index of this chunk in the sequence
+     """
+
+     text: str
+     metadata: dict[str, Any]
+     source_path: str = ""
+     source_file: str = ""
+     embedding_text: str = ""
+     chunk_index: int = 0
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert chunk to dictionary representation."""
+         return {
+             "text": self.text,
+             "metadata": self.metadata,
+             "source_path": self.source_path,
+             "source_file": self.source_file,
+             "embedding_text": self.embedding_text,
+             "chunk_index": self.chunk_index,
+         }
+
+
+ class JSONChunker:
+     """Chunker for generating chunks from JSON data with preserved metadata.
+
+     Supports both in-memory processing and streaming for large files.
+
+     Example:
+         >>> chunker = JSONChunker()
+         >>> # In-memory processing
+         >>> chunks = chunker.chunk({"title": "Hello", "content": "World"})
+         >>> # Streaming from file
+         >>> for chunk in chunker.stream_chunks("large_data.jsonl"):
+         ...     process(chunk)
+     """
+
+     def __init__(self, config: JSONChunkConfig | None = None):
+         """Initialize the JSON chunker.
+
+         Args:
+             config: Configuration for chunking behavior
+         """
+         self.config = config or JSONChunkConfig()
+         self._chunk_index = 0
+         self._jinja_env: Any = None  # Lazy loaded
+
+     def chunk(
+         self,
+         data: dict[str, Any] | list[dict[str, Any]],
+         source: str = "",
+     ) -> list[JSONChunk]:
+         """Chunk in-memory JSON data.
+
+         Args:
+             data: JSON object or array of objects to chunk
+             source: Optional source identifier (e.g., file path)
+
+         Returns:
+             List of JSONChunk objects
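+
+         Example (illustrative):
+             >>> chunks = JSONChunker().chunk([{"name": "A"}, {"name": "B"}])
+             >>> [c.source_path for c in chunks]
+             ['[0]', '[1]']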
+         """
+         self._chunk_index = 0
+
+         if isinstance(data, dict):
+             return [self._process_item(data, source_path="", source_file=source)]
+         elif isinstance(data, list):
+             chunks = []
+             for idx, item in enumerate(data):
+                 if isinstance(item, dict):
+                     chunks.append(
+                         self._process_item(item, source_path=f"[{idx}]", source_file=source)
+                     )
+             return chunks
+         else:
+             raise ValueError(f"Expected dict or list, got {type(data).__name__}")
+
+     def stream_chunks(
+         self,
+         source: str | Path,
+         timeout: int = 10,
+     ) -> Iterator[JSONChunk]:
+         """Stream chunks from large JSON files without loading into memory.
+
+         Leverages dataknobs_utils.json_utils streaming infrastructure.
+
+         Supported formats:
+             - JSON arrays: Each top-level element becomes a chunk source
+             - JSONL files: Each line is a complete JSON object
+             - Compressed files: .gz files auto-detected and decompressed
+             - URLs: Remote JSON fetched with streaming
+
+         Args:
+             source: File path, URL, or JSON string
+             timeout: Request timeout for URLs (seconds)
+
+         Yields:
+             JSONChunk objects as they are processed
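+
+         Example (illustrative; "records.jsonl.gz" is a hypothetical file):
+             >>> chunker = JSONChunker()
+             >>> for chunk in chunker.stream_chunks("records.jsonl.gz"):
+             ...     print(chunk.chunk_index, chunk.metadata)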
+         """
+         source_str = str(source)
+         self._chunk_index = 0
+
+         # Detect format and process accordingly
+         if self._is_jsonl_file(source_str):
+             yield from self._stream_jsonl(source_str)
+         else:
+             yield from self._stream_json_array(source_str, timeout)
+
+     def _is_jsonl_file(self, source: str) -> bool:
+         """Check if source is a JSONL file based on extension."""
+         return source.lower().endswith((".jsonl", ".jsonl.gz", ".ndjson", ".ndjson.gz"))
+
+     def _stream_jsonl(self, source: str) -> Iterator[JSONChunk]:
+         """Stream from a JSONL file (one JSON object per line)."""
+         import gzip
+
+         source_path = Path(source)
+
+         # Handle gzip
+         def open_gzip(p: Path) -> Any:
+             return gzip.open(p, "rt", encoding="utf-8")
+
+         def open_text(p: Path) -> Any:
+             return open(p, encoding="utf-8")
+
+         opener = open_gzip if source.lower().endswith(".gz") else open_text
+
+         with opener(source_path) as f:
+             for line_num, line in enumerate(f):
+                 line = line.strip()
+                 if not line:
+                     continue
+                 try:
+                     item = json.loads(line)
+                     if isinstance(item, dict):
+                         yield self._process_item(
+                             item,
+                             source_path=f"[{line_num}]",
+                             source_file=source,
+                         )
+                 except json.JSONDecodeError:
+                     continue  # Skip malformed lines
+
+     def _stream_json_array(self, source: str, timeout: int) -> Iterator[JSONChunk]:
+         """Stream from a JSON array file using json_utils infrastructure."""
+         try:
+             from dataknobs_utils.json_utils import (
+                 stream_json_data,
+                 PathSorter,
+                 ArrayElementAcceptStrategy,
+                 Path as JsonPath,
+                 build_jq_path,
+             )
+         except ImportError:
+             # Fall back to loading entire file if streaming utils not available
+             yield from self._fallback_load(source)
+             return
+
+         # Use PathSorter to group paths into records
+         sorter = PathSorter(
+             ArrayElementAcceptStrategy(max_array_level=0),
+             max_groups=2,
+         )
+
+         item_num = 0
+
+         def visitor(item: Any, path: tuple[Any, ...]) -> None:
+             nonlocal item_num
+             jq_path = build_jq_path(path, keep_list_idxs=True)
+             sorter.add_path(JsonPath(jq_path, item, line_num=item_num))
+             item_num += 1
+
+         stream_json_data(source, visitor, timeout=timeout)
+
+         # Process collected groups
+         if sorter.groups:
+             for group in sorter.groups:
+                 sorter.close_group(check_size=False)
+                 record_dict = group.as_dict()
+                 # Handle array at root level
+                 if isinstance(record_dict, dict) and len(record_dict) == 1:
+                     root_key = next(iter(record_dict.keys()))
+                     items = record_dict[root_key]
+                     if isinstance(items, list):
+                         for idx, item in enumerate(items):
+                             if isinstance(item, dict):
+                                 yield self._process_item(
+                                     item,
+                                     source_path=f".{root_key}[{idx}]",
+                                     source_file=source,
+                                 )
+
+     def _fallback_load(self, source: str) -> Iterator[JSONChunk]:
+         """Fallback: load entire file when streaming utils unavailable."""
+         import gzip
+
+         source_path = Path(source)
+         if not source_path.exists():
+             return
+
+         if source.lower().endswith(".gz"):
+             with gzip.open(source_path, "rt", encoding="utf-8") as f:
+                 data = json.load(f)
+         else:
+             with open(source_path, encoding="utf-8") as f:
+                 data = json.load(f)
+
+         yield from self.chunk(data, source=source)
+
+     def _process_item(
+         self,
+         item: dict[str, Any],
+         source_path: str,
+         source_file: str,
+     ) -> JSONChunk:
+         """Process a single JSON object into a chunk.
+
+         Args:
+             item: JSON object to process
+             source_path: JSON path to this item
+             source_file: Source file path
+
+         Returns:
+             JSONChunk with generated text and preserved metadata
+         """
+         # Flatten nested structure for metadata
+         flat_metadata = self._flatten(item)
+
+         # Generate text
+         if self.config.text_template:
+             text = self._render_template(item)
+         else:
+             text = self._auto_generate_text(item)
+
+         # Truncate if needed
+         if len(text) > self.config.max_chunk_size:
+             text = text[: self.config.max_chunk_size - 3] + "..."
+
+         # Generate embedding text (enriched with context)
+         embedding_text = self._build_embedding_text(item, text)
+
+         chunk = JSONChunk(
+             text=text,
+             metadata=flat_metadata,
+             source_path=source_path,
+             source_file=source_file,
+             embedding_text=embedding_text,
+             chunk_index=self._chunk_index,
+         )
+         self._chunk_index += 1
+         return chunk
+
+     def _flatten(
+         self,
+         obj: dict[str, Any],
+         prefix: str = "",
+     ) -> dict[str, Any]:
+         """Flatten nested dict/list structure using dot notation.
+
+         Args:
+             obj: Object to flatten
+             prefix: Current key prefix
+
+         Returns:
+             Flattened dictionary
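+
+         Example (illustrative):
+             >>> JSONChunker()._flatten({"a": {"b": 1}, "tags": ["x", "y"]})
+             {'a.b': 1, 'tags': ['x', 'y']}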
+         """
+         result: dict[str, Any] = {}
+         sep = self.config.nested_separator
+
+         for key, value in obj.items():
+             full_key = f"{prefix}{sep}{key}" if prefix else key
+
+             if isinstance(value, dict):
+                 result.update(self._flatten(value, full_key))
+             elif isinstance(value, list):
+                 if value and isinstance(value[0], dict):
+                     # List of objects - store count and flatten first
+                     result[f"{full_key}._count"] = len(value)
+                     result.update(self._flatten(value[0], f"{full_key}[0]"))
+                 else:
+                     # List of primitives - store as-is
+                     result[full_key] = value
+             else:
+                 result[full_key] = value
+
+         return result
+
+     def _auto_generate_text(self, item: dict[str, Any]) -> str:
+         """Auto-generate embeddable text from JSON object.
+
+         Algorithm:
+             1. Extract title/name/label field as primary identifier
+             2. Concatenate text-like fields (description, content, summary)
+             3. Format nested objects with field names
+             4. Handle arrays based on config
+
+         Args:
+             item: JSON object to convert to text
+
+         Returns:
+             Generated text string
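+
+         Example (illustrative; "id" is dropped as a technical field):
+             >>> print(JSONChunker()._auto_generate_text(
+             ...     {"title": "Widget", "description": "A small part", "id": "w-123"}))
+             Widget
+             description: A small part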
+         """
+         parts: list[str] = []
+
+         # Use specific fields if configured
+         if self.config.text_fields:
+             for field_name in self.config.text_fields:
+                 value = self._get_nested_value(item, field_name)
+                 if value is not None:
+                     parts.append(self._format_value(field_name, value))
+             return "\n".join(parts)
+
+         # Auto-detect: prioritize known text fields
+         used_keys: set[str] = set()
+
+         # First pass: extract primary identifier
+         for key in ["title", "name", "label"]:
+             if key in item and isinstance(item[key], str):
+                 parts.append(item[key])
+                 used_keys.add(key)
+                 break
+
+         # Second pass: extract text content fields
+         for key, value in item.items():
+             if key in used_keys:
+                 continue
+             lower_key = key.lower()
+             if lower_key in TEXT_FIELD_NAMES:
+                 if isinstance(value, str) and value.strip():
+                     if not self._is_technical_value(value):
+                         if self.config.include_field_names and key not in ("content", "text", "body"):
+                             parts.append(f"{key}: {value}")
+                         else:
+                             parts.append(value)
+                         used_keys.add(key)
+
+         # Third pass: include other non-technical fields
+         for key, value in item.items():
+             if key in used_keys:
+                 continue
+             lower_key = key.lower()
+             if lower_key in SKIP_FIELD_NAMES:
+                 continue
+             if key.startswith("_"):
+                 continue
+
+             formatted = self._format_value(key, value)
+             if formatted:
+                 parts.append(formatted)
+
+         return "\n".join(parts)
+
+     def _format_value(self, key: str, value: Any, depth: int = 0) -> str:
+         """Format a value for text generation.
+
+         Args:
+             key: Field name
+             value: Field value
+             depth: Nesting depth (incremented on recursive calls)
+
+         Returns:
+             Formatted string
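+
+         Example (illustrative, with the default config):
+             >>> JSONChunker()._format_value("tags", ["a", "b"])
+             'tags: a, b'
+             >>> JSONChunker()._format_value("active", True)
+             'active: yes'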
+         """
+         if value is None:
+             return ""
+
+         if isinstance(value, str):
+             if self.config.skip_technical_fields and self._is_technical_value(value):
+                 return ""
+             if self.config.include_field_names:
+                 return f"{key}: {value}"
+             return value
+
+         if isinstance(value, bool):
+             if self.config.include_field_names:
+                 return f"{key}: {'yes' if value else 'no'}"
+             return "yes" if value else "no"
+
+         if isinstance(value, (int, float)):
+             if self.config.include_field_names:
+                 return f"{key}: {value}"
+             return str(value)
+
+         if isinstance(value, list):
+             if not value:
+                 return ""
+             if isinstance(value[0], dict):
+                 # List of objects - summarize
+                 return f"{key}: {len(value)} items"
+             # List of primitives
+             if self.config.array_handling == "join":
+                 joined = ", ".join(str(v) for v in value[:10])
+                 if len(value) > 10:
+                     joined += f"... ({len(value)} total)"
+                 if self.config.include_field_names:
+                     return f"{key}: {joined}"
+                 return joined
+             elif self.config.array_handling == "first":
+                 return self._format_value(key, value[0], depth)
+             # "expand" - return all items
+             items = [str(v) for v in value]
+             if self.config.include_field_names:
+                 return f"{key}: {', '.join(items)}"
+             return ", ".join(items)
+
+         if isinstance(value, dict):
+             # Nested object - format recursively
+             sub_parts = []
+             for k, v in value.items():
+                 formatted = self._format_value(k, v, depth + 1)
+                 if formatted:
+                     sub_parts.append(formatted)
+             if sub_parts:
+                 if self.config.include_field_names:
+                     return f"{key}: {'; '.join(sub_parts)}"
+                 return "; ".join(sub_parts)
+             return ""
+
+         return ""
+
+     def _is_technical_value(self, value: str) -> bool:
+         """Check if a string value appears to be technical/non-text."""
+         if not self.config.skip_technical_fields:
+             return False
+
+         if len(value) < 10:
+             return False
+
+         if UUID_PATTERN.match(value):
+             return True
+         if BASE64_PATTERN.match(value) and len(value) > 50:
+             return True
+         if TIMESTAMP_PATTERN.match(value):
+             return True
+
+         return False
+
+     def _get_nested_value(self, obj: dict[str, Any], path: str) -> Any:
+         """Get a value from a nested dict using dot notation path.
+
+         Args:
+             obj: Object to traverse
+             path: Dot-notation path (e.g., "config.database.host")
+
+         Returns:
+             Value at path, or None if not found
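+
+         Example (illustrative):
+             >>> chunker = JSONChunker()
+             >>> chunker._get_nested_value({"config": {"db": {"host": "x"}}}, "config.db.host")
+             'x'
+             >>> chunker._get_nested_value({"config": {}}, "config.db.host") is None
+             True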
+         """
+         parts = path.split(self.config.nested_separator)
+         current: Any = obj
+
+         for part in parts:
+             if isinstance(current, dict) and part in current:
+                 current = current[part]
+             else:
+                 return None
+
+         return current
+
+     def _render_template(self, item: dict[str, Any]) -> str:
+         """Render text using Jinja2 template.
+
+         Args:
+             item: JSON object to render
+
+         Returns:
+             Rendered text string
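+
+         Example (illustrative; requires jinja2):
+             >>> config = JSONChunkConfig(text_template="{{ title }} - {{ summary }}")
+             >>> JSONChunker(config)._render_template(
+             ...     {"title": "Widget", "summary": "A small part"})
+             'Widget - A small part'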
+         """
+         if self._jinja_env is None:
+             try:
+                 from jinja2 import Environment
+                 self._jinja_env = Environment()
+             except ImportError as err:
+                 raise ImportError(
+                     "jinja2 is required for template-based text generation. "
+                     "Install it with: pip install jinja2"
+                 ) from err
+
+         template = self._jinja_env.from_string(self.config.text_template)
+         return template.render(**item)
+
+     def _build_embedding_text(self, item: dict[str, Any], base_text: str) -> str:
+         """Build enriched text optimized for embedding.
+
+         Adds context that improves semantic search quality.
+
+         Args:
+             item: Original JSON object
+             base_text: Generated base text
+
+         Returns:
+             Enriched text for embedding
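+
+         Example (illustrative):
+             >>> JSONChunker()._build_embedding_text(
+             ...     {"type": "product", "tags": ["sale", "new"]}, "Widget")
+             '[PRODUCT] Widget Tags: sale, new'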
+         """
+         parts = []
+
+         # Add type/category context if available
+         for key in ["type", "category", "kind", "class"]:
+             if key in item and isinstance(item[key], str):
+                 parts.append(f"[{item[key].upper()}]")
+                 break
+
+         parts.append(base_text)
+
+         # Add tags/keywords if available
+         for key in ["tags", "keywords", "labels"]:
+             if key in item and isinstance(item[key], list):
+                 tags = [str(t) for t in item[key][:5] if isinstance(t, str)]
+                 if tags:
+                     parts.append(f"Tags: {', '.join(tags)}")
+                 break
+
+         return " ".join(parts)