dataknobs-xization 1.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,570 @@
1
+ """Content transformation utilities for converting various formats to markdown.
2
+
3
+ This module provides tools for converting structured data formats (JSON, YAML, CSV)
4
+ into markdown format suitable for RAG ingestion and chunking.
5
+
6
+ The ContentTransformer supports:
7
+ - Generic conversion that preserves structure through heading hierarchy
8
+ - Custom schemas for specialized formatting of known data structures
9
+ - Nested object and array handling
10
+ - Configurable heading levels and formatting options
11
+
12
+ Example:
13
+ >>> transformer = ContentTransformer()
14
+ >>>
15
+ >>> # Generic conversion
16
+ >>> data = {"name": "Chain of Thought", "description": "Step by step reasoning"}
17
+ >>> markdown = transformer.transform_json(data)
18
+ >>>
19
+ >>> # With custom schema
20
+ >>> transformer.register_schema("pattern", {
21
+ ... "title_field": "name",
22
+ ... "sections": [
23
+ ... {"field": "description", "heading": "Description"},
24
+ ... {"field": "example", "heading": "Example", "format": "code"}
25
+ ... ]
26
+ ... })
27
+ >>> markdown = transformer.transform_json(data, schema="pattern")
28
+ """
29
+
30
+ import csv
31
+ import io
32
+ import json
33
+ import logging
34
+ from pathlib import Path
35
+ from typing import Any
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ class ContentTransformer:
41
+ """Transform structured content into markdown for RAG ingestion.
42
+
43
+ This class converts various data formats (JSON, YAML, CSV) into well-structured
44
+ markdown that can be parsed by MarkdownParser and chunked by MarkdownChunker.
45
+
46
+ The transformer creates markdown with appropriate heading hierarchy so that
47
+ the chunker can create semantic boundaries around logical content units.
48
+
49
+ Attributes:
50
+ schemas: Dictionary of registered custom schemas
51
+ config: Transformer configuration options
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ base_heading_level: int = 2,
57
+ include_field_labels: bool = True,
58
+ code_block_fields: list[str] | None = None,
59
+ list_fields: list[str] | None = None,
60
+ ):
61
+ """Initialize the content transformer.
62
+
63
+ Args:
64
+ base_heading_level: Starting heading level for top-level items (default: 2)
65
+ include_field_labels: Whether to bold field names in output (default: True)
66
+ code_block_fields: Field names that should be rendered as code blocks
67
+ list_fields: Field names that should be rendered as bullet lists
68
+ """
69
+ self.base_heading_level = base_heading_level
70
+ self.include_field_labels = include_field_labels
71
+ self.code_block_fields = set(code_block_fields or ["example", "code", "snippet"])
72
+ self.list_fields = set(list_fields or ["items", "steps", "objectives", "symptoms", "solutions"])
73
+ self.schemas: dict[str, dict[str, Any]] = {}
74
+
75
+ def register_schema(self, name: str, schema: dict[str, Any]) -> None:
76
+ """Register a custom schema for specialized content conversion.
77
+
78
+ Schemas define how to map JSON fields to markdown structure.
79
+
80
+ Args:
81
+ name: Schema identifier
82
+ schema: Schema definition with the following structure:
83
+ - title_field: Field to use as the main heading (required)
84
+ - description_field: Field for intro text (optional)
85
+ - sections: List of section definitions, each with:
86
+ - field: Source field name
87
+ - heading: Section heading text
88
+ - format: "text", "code", "list", or "subsections" (default: "text")
89
+ - language: Code block language (for format="code")
90
+ - metadata_fields: Fields to render as key-value metadata
91
+
92
+ Example:
93
+ >>> transformer.register_schema("pattern", {
94
+ ... "title_field": "name",
95
+ ... "description_field": "description",
96
+ ... "sections": [
97
+ ... {"field": "use_case", "heading": "When to Use"},
98
+ ... {"field": "example", "heading": "Example", "format": "code"}
99
+ ... ],
100
+ ... "metadata_fields": ["category", "difficulty"]
101
+ ... })
102
+ """
103
+ self.schemas[name] = schema
104
+ logger.debug(f"Registered schema: {name}")
105
+
106
+ def transform(
107
+ self,
108
+ content: Any,
109
+ format: str = "json",
110
+ schema: str | None = None,
111
+ title: str | None = None,
112
+ ) -> str:
113
+ """Transform content to markdown.
114
+
115
+ Args:
116
+ content: Content to transform (dict, list, string, or file path)
117
+ format: Content format - "json", "yaml", or "csv"
118
+ schema: Optional schema name for custom conversion
119
+ title: Optional document title
120
+
121
+ Returns:
122
+ Markdown formatted string
123
+
124
+ Raises:
125
+ ValueError: If format is not supported
126
+ """
127
+ if format == "json":
128
+ if isinstance(content, (str, Path)):
129
+ with open(content, encoding="utf-8") as f:
130
+ data = json.load(f)
131
+ else:
132
+ data = content
133
+ return self.transform_json(data, schema=schema, title=title)
134
+ elif format == "yaml":
135
+ return self.transform_yaml(content, schema=schema, title=title)
136
+ elif format == "csv":
137
+ return self.transform_csv(content, title=title)
138
+ else:
139
+ raise ValueError(f"Unsupported format: {format}. Use 'json', 'yaml', or 'csv'.")
140
+
141
+ def transform_json(
142
+ self,
143
+ data: dict[str, Any] | list[Any],
144
+ schema: str | None = None,
145
+ title: str | None = None,
146
+ ) -> str:
147
+ """Transform JSON data to markdown.
148
+
149
+ Args:
150
+ data: JSON data (dict or list)
151
+ schema: Optional schema name for custom conversion
152
+ title: Optional document title
153
+
154
+ Returns:
155
+ Markdown formatted string
156
+ """
157
+ lines: list[str] = []
158
+
159
+ # Add document title if provided
160
+ if title:
161
+ lines.extend([f"# {title}", ""])
162
+
163
+ # Use custom schema if specified
164
+ if schema and schema in self.schemas:
165
+ return self._transform_with_schema(data, self.schemas[schema], title)
166
+
167
+ # Generic transformation
168
+ if isinstance(data, list):
169
+ for item in data:
170
+ if isinstance(item, dict):
171
+ lines.extend(self._transform_dict_generic(item, self.base_heading_level))
172
+ lines.extend(["---", ""])
173
+ else:
174
+ lines.append(f"- {item}")
175
+ lines.append("")
176
+ elif isinstance(data, dict):
177
+ lines.extend(self._transform_dict_generic(data, self.base_heading_level))
178
+ else:
179
+ lines.append(str(data))
180
+
181
+ return "\n".join(lines)
182
+
183
+ def transform_yaml(
184
+ self,
185
+ content: str | Path,
186
+ schema: str | None = None,
187
+ title: str | None = None,
188
+ ) -> str:
189
+ """Transform YAML content to markdown.
190
+
191
+ Args:
192
+ content: YAML string or file path
193
+ schema: Optional schema name for custom conversion
194
+ title: Optional document title
195
+
196
+ Returns:
197
+ Markdown formatted string
198
+
199
+ Raises:
200
+ ImportError: If PyYAML is not installed
201
+ """
202
+ try:
203
+ import yaml
204
+ except ImportError:
205
+ raise ImportError("PyYAML is required for YAML transformation. Install with: pip install pyyaml") from None
206
+
207
+ if isinstance(content, (str, Path)) and Path(content).exists():
208
+ with open(content, encoding="utf-8") as f:
209
+ data = yaml.safe_load(f)
210
+ else:
211
+ data = yaml.safe_load(content)
212
+
213
+ return self.transform_json(data, schema=schema, title=title)
214
+
215
+ def transform_csv(
216
+ self,
217
+ content: str | Path,
218
+ title: str | None = None,
219
+ title_field: str | None = None,
220
+ ) -> str:
221
+ """Transform CSV content to markdown.
222
+
223
+ Each row becomes a section with the first column (or title_field) as heading.
224
+
225
+ Args:
226
+ content: CSV string or file path
227
+ title: Optional document title
228
+ title_field: Column to use as section title (default: first column)
229
+
230
+ Returns:
231
+ Markdown formatted string
232
+ """
233
+ lines: list[str] = []
234
+
235
+ if title:
236
+ lines.extend([f"# {title}", ""])
237
+
238
+ # Read CSV
239
+ if isinstance(content, Path) or (isinstance(content, str) and Path(content).exists()):
240
+ with open(content, encoding="utf-8") as f:
241
+ reader = csv.DictReader(f)
242
+ rows = list(reader)
243
+ else:
244
+ reader = csv.DictReader(io.StringIO(content))
245
+ rows = list(reader)
246
+
247
+ if not rows:
248
+ return "\n".join(lines)
249
+
250
+ # Determine title field
251
+ fieldnames = list(rows[0].keys())
252
+ if title_field and title_field in fieldnames:
253
+ title_col = title_field
254
+ else:
255
+ title_col = fieldnames[0]
256
+
257
+ # Transform each row
258
+ for row in rows:
259
+ row_title = row.get(title_col, "Untitled")
260
+ lines.append(f"{'#' * self.base_heading_level} {row_title}")
261
+ lines.append("")
262
+
263
+ for field, value in row.items():
264
+ if field == title_col or not value:
265
+ continue
266
+
267
+ if self.include_field_labels:
268
+ lines.append(f"**{self._format_field_name(field)}**: {value}")
269
+ else:
270
+ lines.append(value)
271
+ lines.append("")
272
+
273
+ lines.extend(["---", ""])
274
+
275
+ return "\n".join(lines)
276
+
277
+ def _transform_with_schema(
278
+ self,
279
+ data: dict[str, Any] | list[Any],
280
+ schema: dict[str, Any],
281
+ title: str | None = None,
282
+ ) -> str:
283
+ """Transform data using a custom schema.
284
+
285
+ Args:
286
+ data: Data to transform (list or dict)
287
+ - List format: [{"name": "Item", ...}, ...]
288
+ - Dict format: {"Item": {...}, ...} (keys become title_field values)
289
+ schema: Schema definition
290
+ title: Optional document title
291
+
292
+ Returns:
293
+ Markdown formatted string
294
+ """
295
+ lines: list[str] = []
296
+
297
+ if title:
298
+ lines.extend([f"# {title}", ""])
299
+
300
+ # Normalize dict-keyed format to list format
301
+ # Dict format: {"Item Name": {"field": "value"}} -> [{"name": "Item Name", "field": "value"}]
302
+ if isinstance(data, dict):
303
+ # Check if this looks like a keyed dict (values are dicts)
304
+ # vs a single item dict (values are primitive)
305
+ if all(isinstance(v, dict) for v in data.values()):
306
+ title_field = schema.get("title_field", "name")
307
+ data = [
308
+ {title_field: key, **value}
309
+ for key, value in data.items()
310
+ ]
311
+ logger.debug(f"Normalized dict-keyed data to list format with {len(data)} items")
312
+
313
+ items = data if isinstance(data, list) else [data]
314
+
315
+ for item in items:
316
+ if not isinstance(item, dict):
317
+ continue
318
+
319
+ # Title
320
+ title_field = schema.get("title_field", "name")
321
+ item_title = item.get(title_field, "Untitled")
322
+ lines.append(f"{'#' * self.base_heading_level} {item_title}")
323
+ lines.append("")
324
+
325
+ # Metadata fields (rendered as bold key-value pairs)
326
+ metadata_fields = schema.get("metadata_fields", [])
327
+ for field in metadata_fields:
328
+ if item.get(field):
329
+ formatted_name = self._format_field_name(field)
330
+ lines.append(f"**{formatted_name}**: {item[field]}")
331
+ if metadata_fields:
332
+ lines.append("")
333
+
334
+ # Description field (intro text without heading)
335
+ desc_field = schema.get("description_field")
336
+ if desc_field and desc_field in item and item[desc_field]:
337
+ lines.extend([item[desc_field], ""])
338
+
339
+ # Sections
340
+ for section in schema.get("sections", []):
341
+ field = section.get("field")
342
+ if field not in item or not item[field]:
343
+ continue
344
+
345
+ heading = section.get("heading", self._format_field_name(field))
346
+ format_type = section.get("format", "text")
347
+
348
+ lines.append(f"{'#' * (self.base_heading_level + 1)} {heading}")
349
+ lines.append("")
350
+
351
+ value = item[field]
352
+
353
+ if format_type == "code":
354
+ language = section.get("language", "")
355
+ lines.append(f"```{language}")
356
+ lines.append(str(value))
357
+ lines.append("```")
358
+ elif format_type == "list":
359
+ if isinstance(value, list):
360
+ for v in value:
361
+ lines.append(f"- {v}")
362
+ else:
363
+ lines.append(f"- {value}")
364
+ elif format_type == "subsections":
365
+ # For nested objects
366
+ if isinstance(value, dict):
367
+ for k, v in value.items():
368
+ lines.append(f"**{self._format_field_name(k)}**: {v}")
369
+ elif isinstance(value, list):
370
+ for v in value:
371
+ if isinstance(v, dict):
372
+ name = v.get("name", v.get("title", "Item"))
373
+ desc = v.get("description", "")
374
+ lines.append(f"- **{name}**: {desc}")
375
+ else:
376
+ lines.append(f"- {v}")
377
+ else: # text
378
+ lines.append(str(value))
379
+
380
+ lines.append("")
381
+
382
+ lines.extend(["---", ""])
383
+
384
+ return "\n".join(lines)
385
+
386
+ def _transform_dict_generic(
387
+ self,
388
+ data: dict[str, Any],
389
+ heading_level: int,
390
+ ) -> list[str]:
391
+ """Transform a dictionary to markdown using generic rules.
392
+
393
+ Args:
394
+ data: Dictionary to transform
395
+ heading_level: Current heading level
396
+
397
+ Returns:
398
+ List of markdown lines
399
+ """
400
+ lines: list[str] = []
401
+
402
+ # Try to find a title field
403
+ title = None
404
+ title_candidates = ["name", "title", "id", "key"]
405
+ for candidate in title_candidates:
406
+ if candidate in data and isinstance(data[candidate], str):
407
+ title = data[candidate]
408
+ break
409
+
410
+ if title:
411
+ lines.append(f"{'#' * heading_level} {title}")
412
+ lines.append("")
413
+
414
+ # Process fields
415
+ for key, value in data.items():
416
+ # Skip title field if we already used it
417
+ if key in title_candidates and key == title:
418
+ continue
419
+
420
+ if value is None or value == "":
421
+ continue
422
+
423
+ formatted_key = self._format_field_name(key)
424
+
425
+ # Handle different value types
426
+ if isinstance(value, dict):
427
+ # Nested object becomes a subsection
428
+ lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
429
+ lines.append("")
430
+ lines.extend(self._transform_dict_generic(value, heading_level + 2))
431
+
432
+ elif isinstance(value, list):
433
+ if key in self.list_fields or all(isinstance(v, str) for v in value):
434
+ # Render as bullet list
435
+ lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
436
+ lines.append("")
437
+ for item in value:
438
+ if isinstance(item, dict):
439
+ # Complex list item
440
+ name = item.get("name", item.get("title", str(item)))
441
+ desc = item.get("description", "")
442
+ if desc:
443
+ lines.append(f"- **{name}**: {desc}")
444
+ else:
445
+ lines.append(f"- {name}")
446
+ else:
447
+ lines.append(f"- {item}")
448
+ lines.append("")
449
+ else:
450
+ # List of complex objects
451
+ lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
452
+ lines.append("")
453
+ for item in value:
454
+ if isinstance(item, dict):
455
+ lines.extend(self._transform_dict_generic(item, heading_level + 2))
456
+ else:
457
+ lines.append(f"- {item}")
458
+ lines.append("")
459
+
460
+ elif key in self.code_block_fields:
461
+ # Render as code block
462
+ lines.append(f"{'#' * (heading_level + 1)} {formatted_key}")
463
+ lines.append("")
464
+ lines.append("```")
465
+ lines.append(str(value))
466
+ lines.append("```")
467
+ lines.append("")
468
+
469
+ else:
470
+ # Simple value
471
+ if self.include_field_labels:
472
+ lines.append(f"**{formatted_key}**: {value}")
473
+ else:
474
+ lines.append(str(value))
475
+ lines.append("")
476
+
477
+ return lines
478
+
479
+ def _format_field_name(self, field: str) -> str:
480
+ """Format a field name for display.
481
+
482
+ Converts snake_case and camelCase to Title Case.
483
+
484
+ Args:
485
+ field: Field name to format
486
+
487
+ Returns:
488
+ Formatted field name
489
+ """
490
+ # Handle snake_case
491
+ words = field.replace("_", " ").replace("-", " ")
492
+
493
+ # Handle camelCase
494
+ result = []
495
+ for i, char in enumerate(words):
496
+ if char.isupper() and i > 0 and words[i-1].islower():
497
+ result.append(" ")
498
+ result.append(char)
499
+
500
+ return "".join(result).title()
501
+
502
+
503
+ # Convenience function for quick transformations
504
def json_to_markdown(
    data: dict[str, Any] | list[Any],
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render JSON data as markdown in a single call.

    Builds a throwaway ContentTransformer with default options (apart from
    the heading level) and runs its generic JSON conversion; no custom
    schema is applied.

    Args:
        data: JSON data to transform
        title: Optional document title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string

    Example:
        >>> patterns = [
        ...     {"name": "Chain of Thought", "description": "Step by step"},
        ...     {"name": "Few-Shot", "description": "Learning from examples"}
        ... ]
        >>> markdown = json_to_markdown(patterns, title="Prompt Patterns")
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_json(
        data,
        title=title,
    )
531
+
532
+
533
def yaml_to_markdown(
    content: str | Path,
    title: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render YAML content as markdown in a single call.

    Builds a throwaway ContentTransformer with default options (apart from
    the heading level) and delegates to its YAML conversion; no custom
    schema is applied.

    Args:
        content: YAML string or file path
        title: Optional document title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_yaml(
        content,
        title=title,
    )
550
+
551
+
552
def csv_to_markdown(
    content: str | Path,
    title: str | None = None,
    title_field: str | None = None,
    base_heading_level: int = 2,
) -> str:
    """Render CSV content as markdown in a single call.

    Builds a throwaway ContentTransformer with default options (apart from
    the heading level) and delegates to its CSV conversion.

    Args:
        content: CSV string or file path
        title: Optional document title
        title_field: Column to use as section title
        base_heading_level: Starting heading level (default: 2)

    Returns:
        Markdown formatted string
    """
    return ContentTransformer(base_heading_level=base_heading_level).transform_csv(
        content,
        title=title,
        title_field=title_field,
    )
@@ -0,0 +1,27 @@
1
+ """Knowledge base ingestion module.
2
+
3
+ This module provides configuration and processing for ingesting
4
+ documents from a directory into a knowledge base.
5
+ """
6
+
7
+ from dataknobs_xization.ingestion.config import (
8
+ FilePatternConfig,
9
+ IngestionConfigError,
10
+ KnowledgeBaseConfig,
11
+ )
12
+ from dataknobs_xization.ingestion.processor import (
13
+ DirectoryProcessor,
14
+ ProcessedDocument,
15
+ process_directory,
16
+ )
17
+
18
+ __all__ = [
19
+ # Config
20
+ "FilePatternConfig",
21
+ "IngestionConfigError",
22
+ "KnowledgeBaseConfig",
23
+ # Processor
24
+ "DirectoryProcessor",
25
+ "ProcessedDocument",
26
+ "process_directory",
27
+ ]