trustgraph-cli 1.3.12.tar.gz → 1.3.13.tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of trustgraph-cli might be problematic.

Files changed (72)
  1. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/PKG-INFO +1 -1
  2. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py +274 -361
  3. trustgraph_cli-1.3.13/trustgraph/cli_version.py +1 -0
  4. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO +1 -1
  5. trustgraph_cli-1.3.12/trustgraph/cli_version.py +0 -1
  6. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/README.md +0 -0
  7. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/pyproject.toml +0 -0
  8. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/setup.cfg +0 -0
  9. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/__init__.py +0 -0
  10. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/add_library_document.py +0 -0
  11. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_config_item.py +0 -0
  12. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_flow_class.py +0 -0
  13. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_kg_core.py +0 -0
  14. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_mcp_tool.py +0 -0
  15. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_tool.py +0 -0
  16. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/dump_msgpack.py +0 -0
  17. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_config_item.py +0 -0
  18. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_flow_class.py +0 -0
  19. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_kg_core.py +0 -0
  20. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/graph_to_turtle.py +0 -0
  21. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/init_pulsar_manager.py +0 -0
  22. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/init_trustgraph.py +0 -0
  23. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_agent.py +0 -0
  24. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_document_rag.py +0 -0
  25. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_graph_rag.py +0 -0
  26. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_llm.py +0 -0
  27. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_mcp_tool.py +0 -0
  28. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_nlp_query.py +0 -0
  29. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_objects_query.py +0 -0
  30. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_prompt.py +0 -0
  31. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_structured_query.py +0 -0
  32. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/list_config_items.py +0 -0
  33. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_doc_embeds.py +0 -0
  34. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_kg_core.py +0 -0
  35. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_knowledge.py +0 -0
  36. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_pdf.py +0 -0
  37. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_sample_documents.py +0 -0
  38. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_text.py +0 -0
  39. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_turtle.py +0 -0
  40. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_config_item.py +0 -0
  41. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_flow_class.py +0 -0
  42. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_kg_core.py +0 -0
  43. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/remove_library_document.py +0 -0
  44. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/save_doc_embeds.py +0 -0
  45. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_mcp_tool.py +0 -0
  46. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_prompt.py +0 -0
  47. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_token_costs.py +0 -0
  48. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_tool.py +0 -0
  49. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_config.py +0 -0
  50. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_classes.py +0 -0
  51. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_state.py +0 -0
  52. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flows.py +0 -0
  53. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_graph.py +0 -0
  54. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_kg_cores.py +0 -0
  55. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_documents.py +0 -0
  56. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_processing.py +0 -0
  57. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_mcp_tools.py +0 -0
  58. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_processor_state.py +0 -0
  59. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_prompts.py +0 -0
  60. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_costs.py +0 -0
  61. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_rate.py +0 -0
  62. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_tools.py +0 -0
  63. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/start_flow.py +0 -0
  64. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/start_library_processing.py +0 -0
  65. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_flow.py +0 -0
  66. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_library_processing.py +0 -0
  67. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/unload_kg_core.py +0 -0
  68. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/SOURCES.txt +0 -0
  69. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/dependency_links.txt +0 -0
  70. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/entry_points.txt +0 -0
  71. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/requires.txt +0 -0
  72. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/top_level.txt +0 -0
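
Most of this release is the rewrite of load_structured_data.py shown below: a new --auto mode chains schema discovery, descriptor generation, a parse preview, and import into a single command, and the CLI now requires an explicit mode flag instead of silently running a full import. A hedged sketch of driving the new mode from Python (the function and parameter names come from the diff below; the API URL is a placeholder, and calling the module this way is illustrative rather than a documented interface):

    from trustgraph.cli.load_structured_data import load_structured_data

    # Preview the automatic pipeline without importing anything (dry run).
    load_structured_data(
        api_url="http://localhost:8088/",  # placeholder; point at your gateway
        input_file="customers.csv",
        auto=True,     # new in 1.3.13: discover schema, build descriptor, import
        dry_run=True,  # stop after the preview instead of importing
    )
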
{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.12
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph

{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py

@@ -31,6 +31,7 @@ def load_structured_data(
  suggest_schema: bool = False,
  generate_descriptor: bool = False,
  parse_only: bool = False,
+ auto: bool = False,
  output_file: str = None,
  sample_size: int = 100,
  sample_chars: int = 500,
@@ -49,6 +50,7 @@ def load_structured_data(
  suggest_schema: Analyze data and suggest matching schemas
  generate_descriptor: Generate descriptor from data sample
  parse_only: Parse data but don't import to TrustGraph
+ auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
  output_file: Path to write output (descriptor/parsed data)
  sample_size: Number of records to sample for analysis
  sample_chars: Maximum characters to read for sampling
@@ -62,7 +64,90 @@ def load_structured_data(
  logging.basicConfig(level=logging.INFO)

  # Determine operation mode
- if suggest_schema:
+ if auto:
+ logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
+ logger.info("Step 1: Analyzing data to discover best matching schema...")
+
+ # Step 1: Auto-discover schema (reuse suggest_schema logic)
+ discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+ if not discovered_schema:
+ logger.error("Failed to discover suitable schema automatically")
+ print("❌ Could not automatically determine the best schema for your data.")
+ print("💡 Try running with --suggest-schema first to see available options.")
+ return None
+
+ logger.info(f"✅ Discovered schema: {discovered_schema}")
+ print(f"🎯 Auto-selected schema: {discovered_schema}")
+
+ # Step 2: Auto-generate descriptor
+ logger.info("Step 2: Generating descriptor configuration...")
+ auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+ if not auto_descriptor:
+ logger.error("Failed to generate descriptor automatically")
+ print("❌ Could not automatically generate descriptor configuration.")
+ return None
+
+ logger.info("✅ Generated descriptor configuration")
+ print("📝 Generated descriptor configuration")
+
+ # Step 3: Parse and preview data
+ logger.info("Step 3: Parsing and validating data...")
+ preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
+ if preview_records is None:
+ logger.error("Failed to parse data with generated descriptor")
+ print("❌ Could not parse data with generated descriptor.")
+ return None
+
+ # Show preview
+ print("📊 Data Preview (first few records):")
+ print("=" * 50)
+ for i, record in enumerate(preview_records[:3], 1):
+ print(f"Record {i}: {record}")
+ print("=" * 50)
+
+ # Step 4: Import (unless dry_run)
+ if dry_run:
+ logger.info("✅ Dry run complete - data is ready for import")
+ print("✅ Dry run successful! Data is ready for import.")
+ print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+ return None
+ else:
+ logger.info("Step 4: Importing data to TrustGraph...")
+ print("🚀 Importing data to TrustGraph...")
+
+ # Recursively call ourselves with the auto-generated descriptor
+ # This reuses all the existing import logic
+ import tempfile
+ import os
+
+ # Save auto-generated descriptor to temp file
+ temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+ json.dump(auto_descriptor, temp_descriptor, indent=2)
+ temp_descriptor.close()
+
+ try:
+ # Call the full pipeline mode with our auto-generated descriptor
+ result = load_structured_data(
+ api_url=api_url,
+ input_file=input_file,
+ descriptor_file=temp_descriptor.name,
+ flow=flow,
+ dry_run=False, # We already handled dry_run above
+ verbose=verbose
+ )
+
+ print("✅ Auto-import completed successfully!")
+ logger.info("Auto-import pipeline completed successfully")
+ return result
+
+ finally:
+ # Clean up temp descriptor file
+ try:
+ os.unlink(temp_descriptor.name)
+ except:
+ pass
+
+ elif suggest_schema:
  logger.info(f"Analyzing {input_file} to suggest schemas...")
  logger.info(f"Sample size: {sample_size} records")
  logger.info(f"Sample chars: {sample_chars} characters")
@@ -497,123 +582,144 @@ def load_structured_data(
  print(f"- Records processed: {len(output_records)}")
  print(f"- Target schema: {schema_name}")
  print(f"- Field mappings: {len(mappings)}")
-
- else:
- # Full pipeline: parse and import
- if not descriptor_file:
- # Auto-generate descriptor if not provided
- logger.info("No descriptor provided, auto-generating...")
- logger.info(f"Schema name: {schema_name}")
-
- # Read sample data for descriptor generation
+
+
+ # Helper functions for auto mode
+ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
+ """Auto-discover the best matching schema for the input data"""
+ try:
+ # Read sample data
+ with open(input_file, 'r', encoding='utf-8') as f:
+ sample_data = f.read(sample_chars)
+
+ # Import API modules
+ from trustgraph.api import Api
+ api = Api(api_url)
+ config_api = api.config()
+
+ # Get available schemas
+ schema_keys = config_api.list("schema")
+ if not schema_keys:
+ logger.error("No schemas available in TrustGraph configuration")
+ return None
+
+ # Get schema definitions
+ schemas = {}
+ for key in schema_keys:
  try:
- with open(input_file, 'r', encoding='utf-8') as f:
- sample_data = f.read(sample_chars)
- logger.info(f"Read {len(sample_data)} characters for descriptor generation")
+ schema_def = config_api.get("schema", key)
+ schemas[key] = schema_def
  except Exception as e:
- logger.error(f"Failed to read input file for descriptor generation: {e}")
- raise
-
- # Generate descriptor using TrustGraph prompt service
- try:
- from trustgraph.api import Api
- from trustgraph.api.types import ConfigKey
-
- api = Api(api_url)
- config_api = api.config()
-
- # Get available schemas
- logger.info("Fetching available schemas for descriptor generation...")
- schema_keys = config_api.list("schema")
- logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
- if not schema_keys:
- logger.warning("No schemas found in configuration")
- print("No schemas available in TrustGraph configuration")
- return
-
- # Fetch each schema definition
- schemas = []
- config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
- schema_values = config_api.get(config_keys)
-
- for value in schema_values:
- try:
- schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
- schemas.append(schema_def)
- logger.debug(f"Loaded schema: {value.key}")
- except json.JSONDecodeError as e:
- logger.warning(f"Failed to parse schema {value.key}: {e}")
- continue
-
- logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
- # Generate descriptor using diagnose-structured-data prompt
- flow_api = api.flow().id(flow)
+ logger.warning(f"Could not load schema {key}: {e}")

- logger.info("Calling TrustGraph diagnose-structured-data prompt for descriptor generation...")
- response = flow_api.prompt(
- id="diagnose-structured-data",
- variables={
- "schemas": schemas,
- "sample": sample_data
- }
- )
-
- # Parse the generated descriptor
- if isinstance(response, str):
- try:
- descriptor = json.loads(response)
- except json.JSONDecodeError:
- logger.error("Generated descriptor is not valid JSON")
- raise ValueError("Failed to generate valid descriptor")
- else:
- descriptor = response
-
- # Override schema_name if provided
- if schema_name:
- descriptor.setdefault('output', {})['schema_name'] = schema_name
-
- logger.info("Successfully generated descriptor from data sample")
-
- except ImportError as e:
- logger.error(f"Failed to import TrustGraph API: {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to generate descriptor: {e}")
- raise
- else:
- # Load existing descriptor
- try:
- with open(descriptor_file, 'r', encoding='utf-8') as f:
- descriptor = json.load(f)
- logger.info(f"Loaded descriptor configuration from {descriptor_file}")
- except Exception as e:
- logger.error(f"Failed to load descriptor file: {e}")
- raise
+ if not schemas:
+ logger.error("No valid schemas could be loaded")
+ return None
+
+ # Use prompt service for schema selection
+ flow_api = api.flow().id("default")
+ prompt_client = flow_api.prompt()

- logger.info(f"Processing {input_file} for import...")
+ prompt = f"""Analyze this data sample and determine the best matching schema:
+
+ DATA SAMPLE:
+ {sample_data[:1000]}
+
+ AVAILABLE SCHEMAS:
+ {json.dumps(schemas, indent=2)}
+
+ Return ONLY the schema name (key) that best matches this data. Consider:
+ 1. Field names and types in the data
+ 2. Data structure and format
+ 3. Domain and use case alignment
+
+ Schema name:"""
+
+ response = prompt_client.schema_selection(
+ schemas=schemas,
+ sample=sample_data[:1000]
+ )

- # Parse data using the same logic as parse-only mode, but with full dataset
- try:
- format_info = descriptor.get('format', {})
- format_type = format_info.get('type', 'csv').lower()
- encoding = format_info.get('encoding', 'utf-8')
-
- logger.info(f"Input format: {format_type}, encoding: {encoding}")
-
- with open(input_file, 'r', encoding=encoding) as f:
- raw_data = f.read()
-
- logger.info(f"Read {len(raw_data)} characters from input file")
+ # Extract schema name from response
+ if isinstance(response, dict) and 'schema' in response:
+ return response['schema']
+ elif isinstance(response, str):
+ # Try to extract schema name from text response
+ response_lower = response.lower().strip()
+ for schema_key in schema_keys:
+ if schema_key.lower() in response_lower:
+ return schema_key
+
+ # If no exact match, try first mentioned schema
+ words = response.split()
+ for word in words:
+ clean_word = word.strip('.,!?":').lower()
+ if clean_word in [s.lower() for s in schema_keys]:
+ matching_schema = next(s for s in schema_keys if s.lower() == clean_word)
+ return matching_schema
+
+ logger.warning(f"Could not parse schema selection from response: {response}")
+
+ # Fallback: return first available schema
+ logger.info(f"Using fallback: first available schema '{schema_keys[0]}'")
+ return schema_keys[0]
+
+ except Exception as e:
+ logger.error(f"Schema discovery failed: {e}")
+ return None
+
+
+ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+ """Auto-generate descriptor configuration for the discovered schema"""
+ try:
+ # Read sample data
+ with open(input_file, 'r', encoding='utf-8') as f:
+ sample_data = f.read(sample_chars)
+
+ # Import API modules
+ from trustgraph.api import Api
+ api = Api(api_url)
+ config_api = api.config()
+
+ # Get schema definition
+ schema_def = config_api.get("schema", schema_name)
+
+ # Use prompt service for descriptor generation
+ flow_api = api.flow().id("default")
+ prompt_client = flow_api.prompt()
+
+ response = prompt_client.diagnose_structured_data(
+ sample=sample_data,
+ schema_name=schema_name,
+ schema=schema_def
+ )
+
+ if isinstance(response, str):
+ try:
+ return json.loads(response)
+ except json.JSONDecodeError:
+ logger.error("Generated descriptor is not valid JSON")
+ return None
+ else:
+ return response

- except Exception as e:
- logger.error(f"Failed to read input file: {e}")
- raise
+ except Exception as e:
+ logger.error(f"Descriptor generation failed: {e}")
+ return None
+
+
+ def _auto_parse_preview(input_file, descriptor, max_records, logger):
+ """Parse and preview data using the auto-generated descriptor"""
+ try:
+ # Simplified parsing logic for preview (reuse existing logic)
+ format_info = descriptor.get('format', {})
+ format_type = format_info.get('type', 'csv').lower()
+ encoding = format_info.get('encoding', 'utf-8')
+
+ with open(input_file, 'r', encoding=encoding) as f:
+ raw_data = f.read()

- # Parse data (reuse parse-only logic but process all records)
  parsed_records = []
- batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)

  if format_type == 'csv':
  import csv
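
The _auto_discover_schema helper added in the hunk above has to tolerate a prompt service that answers in free text rather than with a clean key, so it scans the response for any known schema name before falling back to the first available schema. A distilled, standalone sketch of that matching step (the function name here is mine, not the package's):

    def pick_schema(response_text, schema_keys):
        """Return the first known schema key mentioned in a free-text
        response, mirroring the matching logic in _auto_discover_schema."""
        lowered = response_text.lower()
        # Pass 1: any key that appears verbatim (case-insensitively)
        for key in schema_keys:
            if key.lower() in lowered:
                return key
        # Pass 2: word-by-word scan, stripping trailing punctuation
        by_lower = {k.lower(): k for k in schema_keys}
        for word in response_text.split():
            match = by_lower.get(word.strip('.,!?":').lower())
            if match:
                return match
        return None  # caller falls back to schema_keys[0]
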
@@ -623,261 +729,50 @@ def load_structured_data(
  delimiter = options.get('delimiter', ',')
  has_header = options.get('has_header', True) or options.get('header', True)

- logger.info(f"CSV options - delimiter: '{delimiter}', has_header: {has_header}")
-
- try:
- reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
- if not has_header:
- first_row = next(reader)
- fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
- reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
-
- record_count = 0
- for row in reader:
- parsed_records.append(row)
- record_count += 1
-
- # Process in batches to avoid memory issues
- if record_count % batch_size == 0:
- logger.info(f"Parsed {record_count} records...")
-
- except Exception as e:
- logger.error(f"Failed to parse CSV data: {e}")
- raise
+ reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
+ if not has_header:
+ first_row = next(reader)
+ fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
+ reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
+
+ count = 0
+ for row in reader:
+ if count >= max_records:
+ break
+ parsed_records.append(dict(row))
+ count += 1

  elif format_type == 'json':
- try:
- data = json.loads(raw_data)
- if isinstance(data, list):
- parsed_records = data
- elif isinstance(data, dict):
- root_path = format_info.get('options', {}).get('root_path')
- if root_path:
- if root_path.startswith('$.'):
- key = root_path[2:]
- data = data.get(key, data)
-
- if isinstance(data, list):
- parsed_records = data
- else:
- parsed_records = [data]
-
- except Exception as e:
- logger.error(f"Failed to parse JSON data: {e}")
- raise
-
- elif format_type == 'xml':
- import xml.etree.ElementTree as ET
-
- options = format_info.get('options', {})
- record_path = options.get('record_path', '//record')
- field_attribute = options.get('field_attribute')
+ import json
+ data = json.loads(raw_data)

- # Legacy support for old options format
- if 'root_element' in options or 'record_element' in options:
- root_element = options.get('root_element')
- record_element = options.get('record_element', 'record')
- if root_element:
- record_path = f"//{root_element}/{record_element}"
- else:
- record_path = f"//{record_element}"
-
- logger.info(f"XML options - record_path: '{record_path}', field_attribute: '{field_attribute}'")
-
- try:
- root = ET.fromstring(raw_data)
-
- # Find record elements using XPath
- xpath_expr = record_path
- if xpath_expr.startswith('/ROOT/'):
- xpath_expr = xpath_expr[6:]
- elif xpath_expr.startswith('/'):
- xpath_expr = '.' + xpath_expr
-
- records = root.findall(xpath_expr)
- logger.info(f"Found {len(records)} records using XPath: {record_path} (converted to: {xpath_expr})")
-
- # Convert XML elements to dictionaries
- for element in records:
- record = {}
-
- if field_attribute:
- # Handle field elements with name attributes (UN data format)
- for child in element:
- if child.tag == 'field' and field_attribute in child.attrib:
- field_name = child.attrib[field_attribute]
- field_value = child.text.strip() if child.text else ""
- record[field_name] = field_value
- else:
- # Handle standard XML structure
- record.update(element.attrib)
-
- for child in element:
- if child.text:
- record[child.tag] = child.text.strip()
- else:
- record[child.tag] = ""
-
- if not record and element.text:
- record['value'] = element.text.strip()
-
- parsed_records.append(record)
-
- except ET.ParseError as e:
- logger.error(f"Failed to parse XML data: {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to process XML data: {e}")
- raise
+ if isinstance(data, list):
+ parsed_records = data[:max_records]
+ else:
+ parsed_records = [data]

- else:
- raise ValueError(f"Unsupported format type: {format_type}")
-
- logger.info(f"Successfully parsed {len(parsed_records)} records")
-
- # Apply transformations and create TrustGraph objects
+ # Apply basic field mappings for preview
  mappings = descriptor.get('mappings', [])
- processed_records = []
- schema_name = descriptor.get('output', {}).get('schema_name', 'default')
- confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
-
- logger.info(f"Applying {len(mappings)} field mappings...")
+ preview_records = []

- for record_num, record in enumerate(parsed_records, start=1):
+ for record in parsed_records:
  processed_record = {}
-
  for mapping in mappings:
- source_field = mapping.get('source_field') or mapping.get('source')
- target_field = mapping.get('target_field') or mapping.get('target')
+ source_field = mapping.get('source_field')
+ target_field = mapping.get('target_field', source_field)

  if source_field in record:
  value = record[source_field]
-
- # Apply transforms
- transforms = mapping.get('transforms', [])
- for transform in transforms:
- transform_type = transform.get('type')
-
- if transform_type == 'trim' and isinstance(value, str):
- value = value.strip()
- elif transform_type == 'upper' and isinstance(value, str):
- value = value.upper()
- elif transform_type == 'lower' and isinstance(value, str):
- value = value.lower()
- elif transform_type == 'title_case' and isinstance(value, str):
- value = value.title()
- elif transform_type == 'to_int':
- try:
- value = int(value) if value != '' else None
- except (ValueError, TypeError):
- logger.warning(f"Failed to convert '{value}' to int in record {record_num}")
- elif transform_type == 'to_float':
- try:
- value = float(value) if value != '' else None
- except (ValueError, TypeError):
- logger.warning(f"Failed to convert '{value}' to float in record {record_num}")
-
- # Convert all values to strings as required by ExtractedObject schema
  processed_record[target_field] = str(value) if value is not None else ""
- else:
- logger.warning(f"Source field '{source_field}' not found in record {record_num}")
-
- # Create TrustGraph ExtractedObject
- output_record = {
- "metadata": {
- "id": f"import-{record_num}",
- "metadata": [],
- "user": "trustgraph",
- "collection": "default"
- },
- "schema_name": schema_name,
- "values": processed_record,
- "confidence": confidence,
- "source_span": ""
- }
- processed_records.append(output_record)
-
- logger.info(f"Processed {len(processed_records)} records with transformations")
-
- if dry_run:
- print(f"Dry run mode - would import {len(processed_records)} records to TrustGraph")
- print(f"Target schema: {schema_name}")
- print(f"Sample record:")
- if processed_records:
- # Show what the batched format will look like
- sample_batch = processed_records[:min(3, len(processed_records))]
- batch_values = [record["values"] for record in sample_batch]
- first_record = processed_records[0]
- batched_sample = {
- "metadata": first_record["metadata"],
- "schema_name": first_record["schema_name"],
- "values": batch_values,
- "confidence": first_record["confidence"],
- "source_span": first_record["source_span"]
- }
- print(json.dumps(batched_sample, indent=2))
- return
+
+ if processed_record: # Only add if we got some data
+ preview_records.append(processed_record)

- # Import to TrustGraph using objects import endpoint via WebSocket
- logger.info(f"Importing {len(processed_records)} records to TrustGraph...")
+ return preview_records if preview_records else parsed_records

- try:
- import asyncio
- from websockets.asyncio.client import connect
-
- # Construct objects import URL similar to load_knowledge pattern
- if not api_url.endswith("/"):
- api_url += "/"
-
- # Convert HTTP URL to WebSocket URL if needed
- ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
- objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
-
- logger.info(f"Connecting to objects import endpoint: {objects_url}")
-
- async def import_objects():
- async with connect(objects_url) as ws:
- imported_count = 0
-
- # Process records in batches
- for i in range(0, len(processed_records), batch_size):
- batch_records = processed_records[i:i + batch_size]
-
- # Extract values from each record in the batch
- batch_values = [record["values"] for record in batch_records]
-
- # Create batched ExtractedObject message using first record as template
- first_record = batch_records[0]
- batched_record = {
- "metadata": first_record["metadata"],
- "schema_name": first_record["schema_name"],
- "values": batch_values, # Array of value dictionaries
- "confidence": first_record["confidence"],
- "source_span": first_record["source_span"]
- }
-
- # Send batched ExtractedObject
- await ws.send(json.dumps(batched_record))
- imported_count += len(batch_records)
-
- if imported_count % 100 == 0:
- logger.info(f"Imported {imported_count}/{len(processed_records)} records...")
-
- logger.info(f"Successfully imported {imported_count} records to TrustGraph")
- return imported_count
-
- # Run the async import
- imported_count = asyncio.run(import_objects())
- print(f"Import completed: {imported_count} records imported to schema '{schema_name}'")
-
- except ImportError as e:
- logger.error(f"Failed to import required modules: {e}")
- print(f"Error: Required modules not available - {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to import data to TrustGraph: {e}")
- print(f"Import failed: {e}")
- raise
+ except Exception as e:
+ logger.error(f"Preview parsing failed: {e}")
+ return None


  def main():
@@ -908,26 +803,29 @@ Examples:
  %(prog)s --input customers.csv --descriptor descriptor.json
  %(prog)s --input products.xml --descriptor xml_descriptor.json

- # All-in-one: Auto-generate descriptor and import (for simple cases)
- %(prog)s --input customers.csv --schema-name customer
+ # FULLY AUTOMATIC: Discover schema + generate descriptor + import (zero manual steps!)
+ %(prog)s --input customers.csv --auto
+ %(prog)s --input products.xml --auto --dry-run # Preview before importing

  # Dry run to validate without importing
  %(prog)s --input customers.csv --descriptor descriptor.json --dry-run

  Use Cases:
+ --auto : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
+ (zero manual configuration required!)
  --suggest-schema : Diagnose which TrustGraph schemas might match your data
  (uses --sample-chars to limit data sent for analysis)
  --generate-descriptor: Create/review the structured data language configuration
  (uses --sample-chars to limit data sent for analysis)
  --parse-only : Validate that parsed data looks correct before import
  (uses --sample-size to limit records processed, ignores --sample-chars)
- (no mode flags) : Full pipeline - parse and import to TrustGraph

  For more information on the descriptor format, see:
  docs/tech-specs/structured-data-descriptor.md
- """.strip()
+ """,
  )

+ # Required arguments
  parser.add_argument(
  '-u', '--api-url',
  default=default_url,
@@ -968,6 +866,11 @@ For more information on the descriptor format, see:
  action='store_true',
  help='Parse data using descriptor but don\'t import to TrustGraph'
  )
+ mode_group.add_argument(
+ '--auto',
+ action='store_true',
+ help='Run full automatic pipeline: discover schema + generate descriptor + import data'
+ )

  parser.add_argument(
  '-o', '--output',
@@ -1026,7 +929,12 @@ For more information on the descriptor format, see:

  args = parser.parse_args()

- # Validate argument combinations
+ # Input validation
+ if not os.path.exists(args.input):
+ print(f"Error: Input file not found: {args.input}", file=sys.stderr)
+ sys.exit(1)
+
+ # Mode-specific validation
  if args.parse_only and not args.descriptor:
  print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
  sys.exit(1)
@@ -1038,11 +946,15 @@ For more information on the descriptor format, see:
  if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100: # 100 is default
  print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)

- if not any([args.suggest_schema, args.generate_descriptor, args.parse_only]) and not args.descriptor:
- # Full pipeline mode without descriptor - schema_name should be provided
- if not args.schema_name:
- print("Error: --descriptor or --schema-name is required for full import", file=sys.stderr)
- sys.exit(1)
+ # Require explicit mode selection - no implicit behavior
+ if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+ print("Error: Must specify an operation mode", file=sys.stderr)
+ print("Available modes:", file=sys.stderr)
+ print(" --auto : Discover schema + generate descriptor + import", file=sys.stderr)
+ print(" --suggest-schema : Analyze data and suggest schemas", file=sys.stderr)
+ print(" --generate-descriptor : Generate descriptor from data", file=sys.stderr)
+ print(" --parse-only : Parse data without importing", file=sys.stderr)
+ sys.exit(1)

  try:
  load_structured_data(
@@ -1052,6 +964,7 @@ For more information on the descriptor format, see:
  suggest_schema=args.suggest_schema,
  generate_descriptor=args.generate_descriptor,
  parse_only=args.parse_only,
+ auto=args.auto,
  output_file=args.output,
  sample_size=args.sample_size,
  sample_chars=args.sample_chars,
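
For orientation, the descriptor keys the code above reads (format.type, format.encoding, format.options, mappings entries with source_field/target_field/transforms, and output.schema_name with output.options) imply a document shaped roughly like the sketch below. The key names are grounded in the descriptor.get(...) calls in the diff; the values are invented for illustration:

    descriptor = {
        "format": {
            "type": "csv",              # the diff also handles "json" and "xml"
            "encoding": "utf-8",
            "options": {"delimiter": ",", "has_header": True},
        },
        "mappings": [
            {
                "source_field": "Customer Name",
                "target_field": "name",
                "transforms": [{"type": "trim"}, {"type": "title_case"}],
            },
            {
                "source_field": "Age",
                "target_field": "age",
                "transforms": [{"type": "to_int"}],
            },
        ],
        "output": {
            "schema_name": "customer",
            "options": {"confidence": 0.9, "batch_size": 1000},
        },
    }
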
trustgraph_cli-1.3.13/trustgraph/cli_version.py (new file)

@@ -0,0 +1 @@
+ __version__ = "1.3.13"

{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.12
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph

trustgraph_cli-1.3.12/trustgraph/cli_version.py (deleted)

@@ -1 +0,0 @@
- __version__ = "1.3.12"