krira-augment 2.1.3__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,551 @@
+"""
+Data Transformer module for Krira Chunker V2.0.
+
+Standardizes diverse input formats into Markdown for optimal chunking.
+Converts HTML tables, JSON objects, and raw CSV into clean Markdown format.
+
+Performance: O(1) memory usage with streaming support for large files.
+"""
+
+import csv
+import json
+from dataclasses import dataclass, field
+from io import StringIO
+from typing import Any, Dict, Generator, List, Literal, Optional, Union
+
+
+@dataclass
+class TransformConfig:
+    """
+    Configuration for DataTransformer.
+
+    This dataclass controls transformation behaviors including output format,
+    table handling, and JSON flattening depth.
+
+    Attributes:
+        output_format: Target format for transformation ("markdown" or "plain_text").
+        preserve_tables: Convert tables to Markdown format instead of flattening.
+        max_table_columns: Maximum columns to preserve in tables.
+        json_indent: Add indentation when flattening JSON.
+        max_json_depth: Maximum nesting depth to preserve.
+    """
+
+    # === TARGET FORMAT ===
+    output_format: Literal["markdown", "plain_text"] = "markdown"
+    """Target format for transformation."""
+
+    # === TABLE HANDLING ===
+    preserve_tables: bool = True
+    """Convert tables to Markdown table format instead of flattening."""
+
+    max_table_columns: int = 10
+    """Maximum columns to preserve. Wider tables are summarized."""
+
+    # === JSON HANDLING ===
+    json_indent: bool = True
+    """Add indentation when flattening JSON."""
+
+    max_json_depth: int = 3
+    """Maximum nesting depth to preserve. Deeper objects are truncated."""
+
+    def __post_init__(self) -> None:
+        """Validate configuration parameters."""
+        if self.output_format not in ("markdown", "plain_text"):
+            raise ValueError(
+                f"output_format must be 'markdown' or 'plain_text', "
+                f"got '{self.output_format}'"
+            )
+        if self.max_table_columns <= 0:
+            raise ValueError(
+                f"max_table_columns must be positive, got {self.max_table_columns}"
+            )
+        if self.max_json_depth <= 0:
+            raise ValueError(
+                f"max_json_depth must be positive, got {self.max_json_depth}"
+            )
+
+
+class DataTransformer:
+    """
+    Standardizes diverse input formats into Markdown.
+
+    Converts HTML tables, JSON objects, and raw CSV into clean
+    Markdown format for optimal chunking.
+
+    Features:
+        - CSV to Markdown table conversion
+        - JSON to Markdown list flattening
+        - Nested object support
+        - Configurable column limits
+        - Plain text fallback
+
+    Example:
+        >>> config = TransformConfig(output_format="markdown")
+        >>> transformer = DataTransformer(config)
+        >>> md = transformer.csv_to_markdown("Name,Age\\nAlice,30")
+        >>> print(md)
+        | Name | Age |
+        |------|-----|
+        | Alice | 30 |
+    """
+
+    def __init__(self, config: TransformConfig) -> None:
+        """
+        Initialize the transformer with configuration.
+
+        Args:
+            config: Configuration object controlling transformation behavior.
+
+        Raises:
+            TypeError: If config is not a TransformConfig instance.
+        """
+        if not isinstance(config, TransformConfig):
+            raise TypeError(
+                f"config must be TransformConfig, got {type(config).__name__}"
+            )
+
+        self.config = config
+
+        # Statistics tracking
+        self._stats = {
+            "tables_transformed": 0,
+            "json_objects_transformed": 0,
+            "rows_processed": 0,
+        }
+
+    def csv_to_markdown(
+        self,
+        csv_text: str,
+        has_header: bool = True
+    ) -> str:
+        """
+        Convert CSV text to Markdown table format.
+
+        Args:
+            csv_text: Raw CSV string.
+            has_header: Whether first row is a header.
+
+        Returns:
+            Markdown table string.
+
+        Example Input:
+            "Name,Age\\nAlice,30\\nBob,25"
+
+        Example Output:
+            | Name | Age |
+            |------|-----|
+            | Alice | 30 |
+            | Bob | 25 |
+
+        Edge Cases:
+            - Empty CSV returns "".
+            - Cells with commas are handled correctly (using csv.reader).
+            - Cells with line breaks are stripped.
+            - If column count varies, pads with empty cells.
+        """
+        if not csv_text or not csv_text.strip():
+            return ""
+
+        try:
+            reader = csv.reader(StringIO(csv_text))
+            rows = list(reader)
+        except csv.Error:
+            # If parsing fails, return original text
+            return csv_text
+
+        if not rows:
+            return ""
+
+        # Track statistics
+        self._stats["tables_transformed"] += 1
+        self._stats["rows_processed"] += len(rows)
+
+        # Determine max columns (for normalization)
+        max_cols = max(len(row) for row in rows) if rows else 0
+
+        if max_cols == 0:
+            return ""
+
+        # Apply column limit
+        effective_cols = min(max_cols, self.config.max_table_columns)
+        truncated = max_cols > self.config.max_table_columns
+
+        # Normalize rows (pad with empty cells if needed)
+        normalized_rows = []
+        for row in rows:
+            # Clean cell contents (strip newlines within cells)
+            cleaned_row = [
+                str(cell).replace('\n', ' ').replace('\r', '').strip()
+                for cell in row[:effective_cols]
+            ]
+            # Pad if needed
+            while len(cleaned_row) < effective_cols:
+                cleaned_row.append("")
+            normalized_rows.append(cleaned_row)
+
+        if not normalized_rows:
+            return ""
+
+        if self.config.output_format == "plain_text":
+            return self._csv_to_plain_text(normalized_rows, has_header, truncated)
+
+        # Build Markdown table
+        result_lines = []
+
+        if has_header:
+            # First row is header
+            header_row = normalized_rows[0]
+            data_rows = normalized_rows[1:]
+
+            # Generate header names if first row is empty
+            if not any(cell.strip() for cell in header_row):
+                header_row = [f"Column_{i+1}" for i in range(effective_cols)]
+        else:
+            # Generate column headers
+            header_row = [f"Column_{i+1}" for i in range(effective_cols)]
+            data_rows = normalized_rows
+
+        # Add note if truncated
+        if truncated:
+            result_lines.append(
+                f"*Note: Table truncated from {max_cols} to "
+                f"{effective_cols} columns*\n"
+            )
+
+        # Calculate column widths for alignment
+        col_widths = []
+        for i in range(effective_cols):
+            max_width = len(header_row[i]) if i < len(header_row) else 0
+            for row in data_rows:
+                if i < len(row):
+                    max_width = max(max_width, len(row[i]))
+            col_widths.append(max(max_width, 3)) # Minimum width of 3
+
+        # Build header line
+        header_cells = [
+            f" {header_row[i].ljust(col_widths[i])} "
+            for i in range(effective_cols)
+        ]
+        result_lines.append("|" + "|".join(header_cells) + "|")
+
+        # Build separator line
+        separator_cells = [
+            "-" * (col_widths[i] + 2)
+            for i in range(effective_cols)
+        ]
+        result_lines.append("|" + "|".join(separator_cells) + "|")
+
+        # Build data rows
+        for row in data_rows:
+            data_cells = [
+                f" {row[i].ljust(col_widths[i]) if i < len(row) else ''.ljust(col_widths[i])} "
+                for i in range(effective_cols)
+            ]
+            result_lines.append("|" + "|".join(data_cells) + "|")
+
+        return "\n".join(result_lines)
+
+    def _csv_to_plain_text(
+        self,
+        rows: List[List[str]],
+        has_header: bool,
+        truncated: bool
+    ) -> str:
+        """Convert normalized CSV rows to plain text format."""
+        if not rows:
+            return ""
+
+        result_lines = []
+
+        if truncated:
+            result_lines.append("[Table truncated]")
+
+        if has_header and rows:
+            header = rows[0]
+            data_rows = rows[1:]
+        else:
+            header = [f"Column_{i+1}" for i in range(len(rows[0]))]
+            data_rows = rows
+
+        for row in data_rows:
+            parts = []
+            for i, cell in enumerate(row):
+                if cell.strip():
+                    col_name = header[i] if i < len(header) else f"Column_{i+1}"
+                    parts.append(f"{col_name}: {cell}")
+            if parts:
+                result_lines.append(" | ".join(parts))
+
+        return "\n".join(result_lines)
+
+    def json_to_markdown(self, json_text: str) -> str:
+        """
+        Flatten JSON object into Markdown list.
+
+        Args:
+            json_text: JSON string (object or array).
+
+        Returns:
+            Markdown formatted text.
+
+        Example Input:
+            {"user": "Alice", "age": 30, "city": "NYC"}
+
+        Example Output:
+            - **user**: Alice
+            - **age**: 30
+            - **city**: NYC
+
+        Edge Cases:
+            - Nested objects indent sub-bullets.
+            - Arrays are numbered lists.
+            - Null values display as "None".
+            - Invalid JSON returns original text with warning comment.
+        """
+        if not json_text or not json_text.strip():
+            return ""
+
+        try:
+            data = json.loads(json_text)
+        except json.JSONDecodeError as e:
+            # Return original text with warning
+            return f"<!-- Invalid JSON: {e} -->\n{json_text}"
+
+        # Track statistics
+        self._stats["json_objects_transformed"] += 1
+
+        if self.config.output_format == "plain_text":
+            return self._json_to_plain_text(data)
+
+        return self._format_json_value(data, depth=0)
+
+    def _format_json_value(
+        self,
+        value: Any,
+        depth: int = 0,
+        is_array_item: bool = False
+    ) -> str:
+        """
+        Recursively format a JSON value as Markdown.
+
+        Args:
+            value: The JSON value to format.
+            depth: Current nesting depth.
+            is_array_item: Whether this is an array item.
+
+        Returns:
+            Markdown formatted string.
+        """
+        indent = " " * depth if self.config.json_indent else ""
+
+        # Handle depth limit
+        if depth >= self.config.max_json_depth:
+            if isinstance(value, (dict, list)):
+                return f"{indent}[...truncated...]"
+
+        # Handle None/null
+        if value is None:
+            return "None"
+
+        # Handle primitives
+        if isinstance(value, bool):
+            return str(value).lower()
+
+        if isinstance(value, (int, float)):
+            return str(value)
+
+        if isinstance(value, str):
+            return value
+
+        # Handle arrays
+        if isinstance(value, list):
+            if not value:
+                return "[]"
+
+            lines = []
+            for i, item in enumerate(value, 1):
+                if isinstance(item, dict):
+                    # Nested object in array
+                    formatted = self._format_json_value(item, depth + 1)
+                    lines.append(f"{indent}{i}. ")
+                    # Indent the formatted content
+                    for line in formatted.split('\n'):
+                        lines.append(f"{indent} {line}")
+                elif isinstance(item, list):
+                    # Nested array
+                    formatted = self._format_json_value(item, depth + 1)
+                    lines.append(f"{indent}{i}. {formatted}")
+                else:
+                    # Primitive value
+                    formatted = self._format_json_value(item, depth)
+                    lines.append(f"{indent}{i}. {formatted}")
+
+            return "\n".join(lines)
+
+        # Handle objects
+        if isinstance(value, dict):
+            if not value:
+                return "{}"
+
+            lines = []
+            for key, val in value.items():
+                if isinstance(val, dict):
+                    # Nested object
+                    lines.append(f"{indent}- **{key}**:")
+                    formatted = self._format_json_value(val, depth + 1)
+                    for line in formatted.split('\n'):
+                        lines.append(f" {line}")
+                elif isinstance(val, list):
+                    # Nested array
+                    lines.append(f"{indent}- **{key}**:")
+                    formatted = self._format_json_value(val, depth + 1)
+                    for line in formatted.split('\n'):
+                        lines.append(f" {line}")
+                else:
+                    # Primitive value
+                    formatted = self._format_json_value(val, depth)
+                    lines.append(f"{indent}- **{key}**: {formatted}")
+
+            return "\n".join(lines)
+
+        # Fallback for unknown types
+        return str(value)
+
+    def _json_to_plain_text(self, data: Any, depth: int = 0) -> str:
+        """Convert JSON data to plain text format."""
+        indent = " " * depth if self.config.json_indent else ""
+
+        if depth >= self.config.max_json_depth:
+            if isinstance(data, (dict, list)):
+                return f"{indent}[...truncated...]"
+
+        if data is None:
+            return "None"
+
+        if isinstance(data, bool):
+            return str(data).lower()
+
+        if isinstance(data, (int, float, str)):
+            return str(data)
+
+        if isinstance(data, list):
+            if not data:
+                return "[]"
+            lines = []
+            for i, item in enumerate(data, 1):
+                formatted = self._json_to_plain_text(item, depth + 1)
+                lines.append(f"{indent}{i}. {formatted}")
+            return "\n".join(lines)
+
+        if isinstance(data, dict):
+            if not data:
+                return "{}"
+            lines = []
+            for key, val in data.items():
+                formatted = self._json_to_plain_text(val, depth + 1)
+                if '\n' in formatted:
+                    lines.append(f"{indent}{key}:")
+                    lines.append(formatted)
+                else:
+                    lines.append(f"{indent}{key}: {formatted}")
+            return "\n".join(lines)
+
+        return str(data)
+
+    def transform_row(
+        self,
+        row: Dict[str, Any],
+        format_as: Literal["markdown", "plain_text", "auto"] = "auto"
+    ) -> str:
+        """
+        Transform a single row (dict) to formatted text.
+
+        Args:
+            row: Dictionary representing a data row.
+            format_as: Output format override.
+
+        Returns:
+            Formatted text representation of the row.
+        """
+        if not row:
+            return ""
+
+        output_format = format_as if format_as != "auto" else self.config.output_format
+
+        if output_format == "markdown":
+            parts = []
+            for key, value in row.items():
+                if value is not None and str(value).strip():
+                    parts.append(f"**{key}**: {value}")
+            return " | ".join(parts)
+        else:
+            parts = []
+            for key, value in row.items():
+                if value is not None and str(value).strip():
+                    parts.append(f"{key}: {value}")
+            return " | ".join(parts)
+
+    def transform_rows(
+        self,
+        rows: Generator[Dict[str, Any], None, None]
+    ) -> Generator[str, None, None]:
+        """
+        Transform a stream of rows to formatted text.
+
+        Args:
+            rows: Generator yielding row dictionaries.
+
+        Yields:
+            Formatted text for each row.
+        """
+        for row in rows:
+            transformed = self.transform_row(row)
+            if transformed:
+                yield transformed
+
+    def excel_row_to_text(
+        self,
+        headers: List[str],
+        row_values: List[Any]
+    ) -> str:
+        """
+        Convert an Excel row to formatted text.
+
+        Args:
+            headers: Column headers.
+            row_values: Row values (same length as headers).
+
+        Returns:
+            Formatted text representation.
+        """
+        if not headers or not row_values:
+            return ""
+
+        parts = []
+        for header, value in zip(headers, row_values):
+            if value is not None:
+                str_value = str(value).strip()
+                if str_value:
+                    if self.config.output_format == "markdown":
+                        parts.append(f"**{header}**: {str_value}")
+                    else:
+                        parts.append(f"{header}: {str_value}")
+
+        return " | ".join(parts)
+
+    def get_stats(self) -> Dict[str, int]:
+        """
+        Return transformation statistics.
+
+        Returns:
+            Dictionary with transformation counts.
+        """
+        return dict(self._stats)
+
+    def reset_stats(self) -> None:
+        """Reset internal statistics counters."""
+        self._stats = {
+            "tables_transformed": 0,
+            "json_objects_transformed": 0,
+            "rows_processed": 0,
+        }
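
For reference, here is a minimal usage sketch of the API shown in this diff. It is illustrative only: the import path krira_augment.data_transformer is an assumption (the diff does not show where the module lives inside the wheel), and the sample inputs are made up.

# Usage sketch; the module path is assumed, not confirmed by the diff.
from krira_augment.data_transformer import DataTransformer, TransformConfig

config = TransformConfig(output_format="markdown", max_table_columns=10, max_json_depth=3)
transformer = DataTransformer(config)

# CSV -> Markdown table; columns beyond max_table_columns are dropped with a note.
print(transformer.csv_to_markdown("Name,Age\nAlice,30\nBob,25"))

# JSON -> Markdown bullet list; nesting beyond max_json_depth is truncated.
print(transformer.json_to_markdown('{"user": "Alice", "meta": {"city": "NYC"}}'))

# Counters accumulate across calls.
print(transformer.get_stats())  # {'tables_transformed': 1, 'json_objects_transformed': 1, 'rows_processed': 3}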
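
The streaming path is presumably what the module docstring's "O(1) memory usage" note refers to: transform_rows consumes a generator of row dicts and yields one formatted string per row. A sketch, again with an assumed import path and made-up rows:

from typing import Any, Dict, Generator

from krira_augment.data_transformer import DataTransformer, TransformConfig  # assumed path

def read_rows() -> Generator[Dict[str, Any], None, None]:
    # Stand-in for a real row source (csv.DictReader, a database cursor, a spreadsheet reader, ...)
    yield {"Name": "Alice", "Age": 30}
    yield {"Name": "Bob", "Age": 25}

transformer = DataTransformer(TransformConfig(output_format="markdown"))
for line in transformer.transform_rows(read_rows()):
    print(line)  # "**Name**: Alice | **Age**: 30", then "**Name**: Bob | **Age**: 25"

# excel_row_to_text does the same for parallel header/value lists:
print(transformer.excel_row_to_text(["Name", "Age"], ["Carol", 41]))  # "**Name**: Carol | **Age**: 41"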