ostruct-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,650 @@
1
+ """Template filters for Jinja2 environment."""
2
+
3
+ import datetime
4
+ import itertools
5
+ import json
6
+ import logging
7
+ import re
8
+ import textwrap
9
+ from collections import Counter
10
+ from typing import Any, Dict, List, Optional, Sequence, TypeVar, Union
11
+
12
+ import tiktoken
13
+ from jinja2 import Environment
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter, NullFormatter, TerminalFormatter
16
+ from pygments.lexers import TextLexer, get_lexer_by_name, guess_lexer
17
+ from pygments.util import ClassNotFound
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ T = TypeVar("T")
22
+
23
+
24
def extract_keywords(text: str) -> List[str]:
    """Split *text* on whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens
27
+
28
+
29
def word_count(text: str) -> int:
    """Return the number of whitespace-separated words in *text*."""
    words = text.split()
    return len(words)
32
+
33
+
34
def char_count(text: str) -> int:
    """Return the number of characters in *text*."""
    total = len(text)
    return total
37
+
38
+
39
def to_json(obj: Any) -> str:
    """Serialize *obj* to a 2-space-indented JSON string.

    Raises TypeError if *obj* is not JSON-serializable.
    """
    return json.dumps(obj, indent=2)
42
+
43
+
44
def from_json(text: str) -> Any:
    """Deserialize the JSON document in *text* into Python objects."""
    parsed = json.loads(text)
    return parsed
47
+
48
+
49
def remove_comments(text: str) -> str:
    """Strip ``#``, ``//`` and ``/* */`` style comments from *text*.

    NOTE(review): works on raw text, so comment markers that appear
    inside string literals are removed as well.
    """
    pattern = re.compile(r"#.*$|//.*$|/\*[\s\S]*?\*/", re.MULTILINE)
    return pattern.sub("", text)
52
+
53
+
54
def wrap_text(text: str, width: int = 80) -> str:
    """Re-flow *text* so no line exceeds *width* columns."""
    return textwrap.fill(text, width=width)
57
+
58
+
59
def indent_text(text: str, width: int = 4) -> str:
    """Prefix every line of *text* with *width* spaces."""
    prefix = " " * width
    return textwrap.indent(text, prefix)
62
+
63
+
64
def dedent_text(text: str) -> str:
    """Strip the whitespace prefix common to all lines of *text*."""
    return textwrap.dedent(text)
67
+
68
+
69
def normalize_text(text: str) -> str:
    """Collapse all runs of whitespace in *text* to single spaces."""
    parts = text.split()
    return " ".join(parts)
72
+
73
+
74
def strip_markdown(text: str) -> str:
    """Delete the markdown punctuation ``# * ` _ ~`` from *text*."""
    markers = re.compile(r"[#*`_~]")
    return markers.sub("", text)
77
+
78
+
79
def format_table(headers: Sequence[Any], rows: Sequence[Sequence[Any]]) -> str:
    """Render *headers* and *rows* as a markdown table string.

    The delimiter row is padded to at least three dashes per column.
    """
    header_cells = [str(h) for h in headers]
    head_line = "| " + " | ".join(header_cells) + " |\n"
    sep_line = (
        "| " + " | ".join("-" * max(len(c), 3) for c in header_cells) + " |\n"
    )
    body = "\n".join(
        "| " + " | ".join(str(cell) for cell in row) + " |" for row in rows
    )
    return head_line + sep_line + body
88
+
89
+
90
def align_table(
    headers: Sequence[Any],
    rows: Sequence[Sequence[Any]],
    alignments: Optional[Sequence[str]] = None,
) -> str:
    """Render a markdown table with per-column alignment markers.

    *alignments* entries may be ``"left"``, ``"center"`` or ``"right"``;
    anything else (and the default) yields a plain ``---`` marker per the
    lookup below. When *alignments* is omitted, every column is left-aligned.
    """
    marker_for = {"center": ":---:", "left": ":---", "right": "---:"}
    specs = alignments if alignments else ["left"] * len(headers)
    markers = [marker_for.get(spec, "---") for spec in specs]

    head_line = "| " + " | ".join(str(h) for h in headers) + " |\n"
    sep_line = "| " + " | ".join(markers) + " |\n"
    body = "\n".join(
        "| " + " | ".join(str(cell) for cell in row) + " |" for row in rows
    )
    return head_line + sep_line + body
115
+
116
+
117
def dict_to_table(data: Dict[Any, Any]) -> str:
    """Render a mapping as a two-column Key/Value markdown table."""
    entry_rows = [f"| {k} | {v} |" for k, v in data.items()]
    return "| Key | Value |\n| --- | --- |\n" + "\n".join(entry_rows)
122
+
123
+
124
def list_to_table(
    items: Sequence[Any], headers: Optional[Sequence[str]] = None
) -> str:
    """Convert a list to a markdown table.

    Without *headers*, renders a numbered two-column (#/Value) table of
    the items. With *headers*, each element of *items* must itself be a
    sequence of cells, one row per element.
    """
    if not headers:
        return "| # | Value |\n| --- | --- |\n" + "\n".join(
            f"| {i+1} | {item} |" for i, item in enumerate(items)
        )
    # Pad each delimiter to at least three dashes: a shorter run is not a
    # valid markdown delimiter row, and this matches format_table's rule.
    return (
        f"| {' | '.join(headers)} |\n"
        f"| {' | '.join('-' * max(len(h), 3) for h in headers)} |\n"
        + "\n".join(
            f"| {' | '.join(str(cell) for cell in row)} |" for row in items
        )
    )
138
+
139
+
140
def escape_special(text: str) -> str:
    """Backslash-escape braces, brackets, quotes and backslashes in *text*."""
    special = re.compile(r'([{}\[\]"\'\\])')
    return special.sub(r"\\\1", text)
143
+
144
+
145
def debug_print(x: Any) -> None:
    """Print *x* to stdout prefixed with ``DEBUG:``."""
    message = f"DEBUG: {x}"
    print(message)
148
+
149
+
150
def type_of(x: Any) -> str:
    """Return the name of *x*'s concrete type."""
    cls = type(x)
    return cls.__name__
153
+
154
+
155
def dir_of(x: Any) -> List[str]:
    """Return *x*'s attribute names as reported by ``dir``."""
    attributes = dir(x)
    return attributes
158
+
159
+
160
def len_of(x: Any) -> Optional[int]:
    """Return ``len(x)``, or None when *x* does not define a length."""
    if hasattr(x, "__len__"):
        return len(x)
    return None
163
+
164
+
165
def validate_json(text: str) -> bool:
    """Return True when *text* parses as JSON, False otherwise.

    Empty or falsy input is treated as invalid without parsing.
    """
    if not text:
        return False
    try:
        json.loads(text)
    except json.JSONDecodeError:
        return False
    return True
174
+
175
+
176
def format_error(e: Exception) -> str:
    """Render *e* as ``"ExceptionType: message"``."""
    name = type(e).__name__
    return f"{name}: {e}"
179
+
180
+
181
def estimate_tokens(text: str) -> int:
    """Estimate how many model tokens *text* contains.

    Uses tiktoken's gpt-4 encoding; if the encoder cannot be loaded or
    fails, falls back to a simple whitespace word count.
    """
    try:
        encoder = tiktoken.encoding_for_model("gpt-4")
        return len(encoder.encode(str(text)))
    except Exception as e:
        logger.warning(f"Failed to estimate tokens: {e}")
        return len(str(text).split())
189
+
190
+
191
def format_json(obj: Any) -> str:
    """Serialize *obj* as 2-space-indented JSON.

    Values json cannot serialize are stringified via ``default=str``.
    """
    return json.dumps(obj, default=str, indent=2)
194
+
195
+
196
def auto_table(data: Any) -> str:
    """Dispatch *data* to the table formatter matching its type.

    Dicts go to dict_to_table, lists/tuples to list_to_table, anything
    else is stringified.
    """
    if isinstance(data, dict):
        return dict_to_table(data)
    elif isinstance(data, (list, tuple)):
        return list_to_table(data)
    else:
        return str(data)
203
+
204
+
205
def sort_by(items: Sequence[T], key: str) -> List[T]:
    """Return *items* sorted by the dict entry or attribute *key*.

    Items missing the key sort with value 0.
    """

    def _value_of(item: T) -> Any:
        if isinstance(item, dict):
            return item.get(key, 0)
        return getattr(item, key, 0)

    return sorted(items, key=_value_of)
214
+
215
+
216
def group_by(items: Sequence[T], key: str) -> Dict[Any, List[T]]:
    """Group *items* into a dict keyed by the dict entry or attribute *key*.

    Items missing the key are grouped under ``None``. Groups appear in
    first-occurrence order. Unlike the previous sort-then-groupby
    implementation, key values need not be mutually comparable (a mix of
    e.g. ``None`` and numbers no longer raises TypeError); they must,
    however, be hashable.
    """

    def _key_of(item: T) -> Any:
        if isinstance(item, dict):
            return item.get(key)
        return getattr(item, key, None)

    groups: Dict[Any, List[T]] = {}
    for item in items:
        # Single pass: no sorting needed, so incomparable keys are fine.
        groups.setdefault(_key_of(item), []).append(item)
    return groups
229
+
230
+
231
def filter_by(items: Sequence[T], key: str, value: Any) -> List[T]:
    """Return the items whose *key* field compares equal to *value*.

    The field is read from a dict entry or an attribute; missing fields
    read as None.
    """
    selected: List[T] = []
    for item in items:
        current = (
            item.get(key) if isinstance(item, dict) else getattr(item, key, None)
        )
        if current == value:
            selected.append(item)
    return selected
239
+
240
+
241
def extract_field(items: Sequence[Any], key: str) -> List[Any]:
    """Collect the value of *key* from every item (None when absent)."""
    values = []
    for item in items:
        if isinstance(item, dict):
            values.append(item.get(key))
        else:
            values.append(getattr(item, key, None))
    return values
247
+
248
+
249
def frequency(items: Sequence[T]) -> Dict[T, int]:
    """Map each distinct item to its number of occurrences."""
    counts: Dict[T, int] = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    return counts
252
+
253
+
254
def aggregate(
    items: Sequence[Any], key: Optional[str] = None
) -> Dict[str, Union[int, float]]:
    """Compute count/sum/avg/min/max statistics over *items*.

    With *key*, the numeric value is pulled from each item's dict entry
    or attribute (missing -> 0, None -> 0.0). Without *key*, the items
    themselves must be int or float; anything else raises ValueError.
    Empty input yields all-zero statistics.
    """
    if not items:
        return {"count": 0, "sum": 0, "avg": 0, "min": 0, "max": 0}

    def _numeric(item: Any) -> float:
        if key is None:
            if not isinstance(item, (int, float)):
                raise ValueError(f"Cannot convert {type(item)} to float")
            return float(item)
        raw = item.get(key) if isinstance(item, dict) else getattr(item, key, 0)
        return 0.0 if raw is None else float(raw)

    values = [_numeric(item) for item in items]
    total = sum(values)
    return {
        "count": len(values),
        "sum": total,
        "avg": total / len(values),
        "min": min(values),
        "max": max(values),
    }
279
+
280
+
281
def unique(items: Sequence[Any]) -> List[Any]:
    """Drop duplicates from *items*, keeping first-seen order."""
    seen = dict.fromkeys(items)
    return list(seen)
284
+
285
+
286
def pivot_table(
    data: Sequence[Dict[str, Any]],
    index: str,
    value: str,
    aggfunc: str = "sum",
) -> Dict[str, Dict[str, Any]]:
    """Aggregate *value* grouped by *index* over a sequence of row dicts.

    Returns ``{"aggregates": {index_str: {"value": aggregate}},
    "metadata": {...}}``. *aggfunc* must be ``sum``, ``mean`` or
    ``count``. Values that cannot be converted to float count as 0 and
    are tallied in the metadata, as are rows whose index is None.
    Column presence is only validated against the first row.
    """
    if not data:
        logger.debug("Empty data provided to pivot_table")
        return {
            "aggregates": {},
            "metadata": {"total_records": 0, "null_index_count": 0},
        }

    valid_aggfuncs = {"sum", "mean", "count"}
    if aggfunc not in valid_aggfuncs:
        raise ValueError(
            f"Invalid aggfunc: {aggfunc}. Must be one of {valid_aggfuncs}"
        )

    # Column validation looks only at the first row, matching the
    # original contract; later rows may omit columns (treated as 0/"").
    missing = []
    if index not in data[0]:
        missing.append(f"index column '{index}'")
    if value not in data[0]:
        missing.append(f"value column '{value}'")
    if missing:
        raise ValueError(f"Missing required columns: {', '.join(missing)}")

    null_index_count = sum(1 for row in data if row.get(index) is None)
    if null_index_count:
        logger.warning(f"Found {null_index_count} rows with null index values")

    # Bucket the float values by stringified index.
    groups: Dict[str, List[float]] = {}
    invalid_values = 0
    for row in data:
        idx = str(row.get(index, ""))
        try:
            val = float(row.get(value, 0))
        except (TypeError, ValueError):
            invalid_values += 1
            logger.warning(
                f"Invalid value for {value} in row with index {idx}, using 0"
            )
            val = 0.0
        groups.setdefault(idx, []).append(val)

    if invalid_values:
        logger.warning(
            f"Found {invalid_values} invalid values in column {value}"
        )

    aggregates: Dict[str, Any] = {}
    for idx, vals in groups.items():
        if aggfunc == "sum":
            aggregates[idx] = {"value": sum(vals)}
        elif aggfunc == "mean":
            aggregates[idx] = {"value": sum(vals) / len(vals)}
        else:  # count
            aggregates[idx] = {"value": len(vals)}

    return {
        "aggregates": aggregates,
        "metadata": {
            "total_records": len(data),
            "null_index_count": null_index_count,
            "invalid_values": invalid_values,
        },
    }
359
+
360
+
361
def summarize(
    data: Sequence[Any], keys: Optional[Sequence[str]] = None
) -> Dict[str, Any]:
    """Generate summary statistics for each field of *data*.

    Args:
        data: Sequence of dicts or objects with attributes. Only the
            first item is checked for shape/keys.
        keys: Optional explicit field names; when omitted, the first
            item's dict keys (or non-underscore attributes) are used.

    Returns:
        ``{"total_records": N, "fields": {name: stats}}`` where stats
        includes type/total/null_count/unique, plus min/max/avg for
        numeric fields and up to five most_common values.

    Raises:
        TypeError: If the first item is neither a dict nor an object.
        ValueError: Wrapping any analysis failure (original exception is
            logged with traceback; note the raise below does not chain
            with ``from e``).
    """
    if not data:
        logger.debug("Empty data provided to summarize")
        return {"total_records": 0, "fields": {}}

    # Validate data type
    if not isinstance(data[0], dict) and not hasattr(data[0], "__dict__"):
        raise TypeError("Data items must be dictionaries or objects")

    def get_field_value(item: Any, field: str) -> Any:
        # Missing fields (and any access error, e.g. a raising property)
        # degrade to None rather than aborting the whole analysis.
        try:
            if isinstance(item, dict):
                return item.get(field)
            return getattr(item, field, None)
        except Exception as e:
            logger.warning(f"Error accessing field {field}: {e}")
            return None

    def get_field_type(values: List[Any]) -> str:
        """Determine field type from non-null values."""
        non_null = [v for v in values if v is not None]
        if not non_null:
            return "NoneType"

        # Check if all values are of the same type
        types = {type(v) for v in non_null}
        if len(types) == 1:
            return next(iter(types)).__name__

        # Handle mixed numeric types
        if all(isinstance(v, (int, float)) for v in non_null):
            return "number"

        # Default to most specific common ancestor type
        return "mixed"

    def analyze_field(field: str) -> Dict[str, Any]:
        # Build the per-field stats dict described in the docstring.
        logger.debug(f"Analyzing field: {field}")
        values = [get_field_value(x, field) for x in data]
        non_null = [v for v in values if v is not None]

        stats = {
            "type": get_field_type(values),
            "total": len(values),
            "null_count": len(values) - len(non_null),
            # NOTE(review): set() requires hashable values — unhashable
            # field values would raise here; confirm callers only pass
            # scalar-valued fields.
            "unique": len(set(non_null)),
        }

        # Add numeric statistics if applicable
        if stats["type"] in ("int", "float", "number"):
            try:
                nums = [float(x) for x in non_null]
                stats.update(
                    {
                        "min": min(nums) if nums else None,
                        "max": max(nums) if nums else None,
                        "avg": sum(nums) / len(nums) if nums else None,
                    }
                )
            except (ValueError, TypeError) as e:
                logger.warning(
                    f"Error calculating numeric stats for {field}: {e}"
                )

        # Add most common values
        if non_null:
            try:
                most_common = Counter(non_null).most_common(5)
                stats["most_common"] = [
                    {"value": str(v), "count": c} for v, c in most_common
                ]
            except TypeError as e:
                logger.warning(
                    f"Error calculating most common values for {field}: {e}"
                )

        return stats

    try:
        # Field list: explicit keys win; otherwise inspect the first item.
        available_keys = keys or (
            list(data[0].keys())
            if isinstance(data[0], dict)
            else [k for k in dir(data[0]) if not k.startswith("_")]
        )

        if not available_keys:
            raise ValueError("No valid keys found in data")

        logger.debug(
            f"Analyzing {len(data)} records with {len(available_keys)} fields"
        )
        result = {
            "total_records": len(data),
            "fields": {k: analyze_field(k) for k in available_keys},
        }
        logger.debug("Analysis complete")
        return result

    except Exception as e:
        # Broad boundary catch: anything raised above is logged with
        # traceback and re-surfaced to callers as ValueError.
        logger.error(f"Failed to analyze data: {e}", exc_info=True)
        raise ValueError(f"Failed to analyze data: {str(e)}")
465
+
466
+
467
def strip_comments(text: str, lang: str = "python") -> str:
    """Remove comments from code text based on language.

    Args:
        text: Code text to process
        lang: Programming language

    Returns:
        Text with comments removed if language is supported,
        otherwise returns original text with a warning

    NOTE(review): purely textual — comment markers inside string
    literals (e.g. ``"#"`` in a Python string) are treated as comment
    starts and removed; confirm this is acceptable for callers.
    """
    # Define comment patterns for different languages
    single_line_comments = {
        "python": "#",
        "javascript": "//",
        "typescript": "//",
        "java": "//",
        "c": "//",
        "cpp": "//",
        "go": "//",
        "rust": "//",
        "swift": "//",
        "ruby": "#",
        "perl": "#",
        "shell": "#",
        "bash": "#",
        "php": "//",
    }

    multi_line_comments = {
        "javascript": ("/*", "*/"),
        "typescript": ("/*", "*/"),
        "java": ("/*", "*/"),
        "c": ("/*", "*/"),
        "cpp": ("/*", "*/"),
        "go": ("/*", "*/"),
        "rust": ("/*", "*/"),
        "swift": ("/*", "*/"),
        "php": ("/*", "*/"),
    }

    # Return original text if language is not supported
    if lang not in single_line_comments and lang not in multi_line_comments:
        logger.debug(
            f"Language '{lang}' is not supported for comment removal. "
            f"Comments will be preserved in the output."
        )
        return text

    lines = text.splitlines()
    cleaned_lines = []

    # Handle single-line comments; lines left empty after stripping are
    # dropped entirely, so line numbering is not preserved.
    if lang in single_line_comments:
        comment_char = single_line_comments[lang]
        for line in lines:
            # Remove inline comments
            line = re.sub(f"\\s*{re.escape(comment_char)}.*$", "", line)
            # Keep non-empty lines
            if line.strip():
                cleaned_lines.append(line)
        text = "\n".join(cleaned_lines)

    # Handle multi-line comments (non-greedy, across newlines)
    if lang in multi_line_comments:
        start, end = multi_line_comments[lang]
        # Remove multi-line comments
        text = re.sub(
            f"{re.escape(start)}.*?{re.escape(end)}", "", text, flags=re.DOTALL
        )

    return text
539
+
540
+
541
def format_code(
    text: str, output_format: str = "terminal", language: str = "python"
) -> str:
    """Format code with syntax highlighting.

    Args:
        text (str): The code text to format
        output_format (str): The output format ('terminal', 'html', or 'plain')
        language (str): The programming language for syntax highlighting

    Returns:
        str: Formatted code string

    Raises:
        ValueError: If output_format is not one of 'terminal', 'html', or 'plain'
    """
    if not text:
        return ""

    if output_format not in ["terminal", "html", "plain"]:
        raise ValueError(
            "output_format must be one of 'terminal', 'html', or 'plain'"
        )

    # Resolve a lexer: named language first, then a content-based guess,
    # and finally plain text so lexing never fails outright.
    try:
        lexer = get_lexer_by_name(language)
    except ClassNotFound:
        try:
            lexer = guess_lexer(text)
        except ClassNotFound:
            lexer = TextLexer()

    try:
        if output_format == "terminal":
            # NOTE(review): the ``Formatter[str]`` subscripts require a
            # pygments version whose formatters support class-getitem —
            # confirm against the pinned pygments release.
            formatter: Union[
                TerminalFormatter[str], HtmlFormatter[str], NullFormatter[str]
            ] = TerminalFormatter[str]()
        elif output_format == "html":
            formatter = HtmlFormatter[str]()
        else:  # plain
            formatter = NullFormatter[str]()

        return highlight(text, lexer, formatter)
    except Exception as e:
        # Highlighting is best-effort: on any failure return the raw text.
        logger.error(f"Error formatting code: {e}")
        return text
587
+
588
+
589
def register_template_filters(env: Environment) -> None:
    """Install this module's filters and globals on a Jinja2 environment.

    Args:
        env: The Jinja2 environment to register filters with.
    """
    text_filters = {
        "extract_keywords": extract_keywords,
        "word_count": word_count,
        "char_count": char_count,
        "to_json": to_json,
        "from_json": from_json,
        "remove_comments": remove_comments,
        "wrap": wrap_text,
        "indent": indent_text,
        "dedent": dedent_text,
        "normalize": normalize_text,
        "strip_markdown": strip_markdown,
    }
    data_filters = {
        "sort_by": sort_by,
        "group_by": group_by,
        "filter_by": filter_by,
        "extract_field": extract_field,
        "unique": unique,
        "frequency": frequency,
        "aggregate": aggregate,
    }
    table_filters = {
        "table": format_table,
        "align_table": align_table,
        "dict_to_table": dict_to_table,
        "list_to_table": list_to_table,
        "auto_table": auto_table,
    }
    code_filters = {
        "format_code": format_code,
        "strip_comments": strip_comments,
        "escape_special": escape_special,
    }

    env.filters.update(text_filters)
    env.filters.update(data_filters)
    env.filters.update(table_filters)
    env.filters.update(code_filters)

    # Globals are callable directly from templates (not via the | pipe).
    env.globals.update(
        {
            "estimate_tokens": estimate_tokens,
            "format_json": format_json,
            "now": datetime.datetime.now,
            "debug": debug_print,
            "type_of": type_of,
            "dir_of": dir_of,
            "len_of": len_of,
            "validate_json": validate_json,
            "format_error": format_error,
            # Data analysis globals
            "summarize": summarize,
            "pivot_table": pivot_table,
            # Table utilities
            "auto_table": auto_table,
        }
    )