trustgraph-cli 1.3.11.tar.gz → 1.3.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.

Potentially problematic release: this version of trustgraph-cli might be problematic.

Files changed (72)
  1. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/PKG-INFO +1 -1
  2. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py +274 -334
  3. trustgraph_cli-1.3.13/trustgraph/cli_version.py +1 -0
  4. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO +1 -1
  5. trustgraph_cli-1.3.11/trustgraph/cli_version.py +0 -1
  6. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/README.md +0 -0
  7. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/pyproject.toml +0 -0
  8. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/setup.cfg +0 -0
  9. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/__init__.py +0 -0
  10. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/add_library_document.py +0 -0
  11. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_config_item.py +0 -0
  12. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_flow_class.py +0 -0
  13. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_kg_core.py +0 -0
  14. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_mcp_tool.py +0 -0
  15. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_tool.py +0 -0
  16. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/dump_msgpack.py +0 -0
  17. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/get_config_item.py +0 -0
  18. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/get_flow_class.py +0 -0
  19. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/get_kg_core.py +0 -0
  20. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/graph_to_turtle.py +0 -0
  21. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/init_pulsar_manager.py +0 -0
  22. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/init_trustgraph.py +0 -0
  23. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_agent.py +0 -0
  24. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_document_rag.py +0 -0
  25. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_graph_rag.py +0 -0
  26. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_llm.py +0 -0
  27. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_mcp_tool.py +0 -0
  28. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_nlp_query.py +0 -0
  29. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_objects_query.py +0 -0
  30. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_prompt.py +0 -0
  31. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_structured_query.py +0 -0
  32. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/list_config_items.py +0 -0
  33. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_doc_embeds.py +0 -0
  34. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_kg_core.py +0 -0
  35. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_knowledge.py +0 -0
  36. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_pdf.py +0 -0
  37. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_sample_documents.py +0 -0
  38. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_text.py +0 -0
  39. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_turtle.py +0 -0
  40. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/put_config_item.py +0 -0
  41. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/put_flow_class.py +0 -0
  42. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/put_kg_core.py +0 -0
  43. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/remove_library_document.py +0 -0
  44. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/save_doc_embeds.py +0 -0
  45. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/set_mcp_tool.py +0 -0
  46. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/set_prompt.py +0 -0
  47. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/set_token_costs.py +0 -0
  48. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/set_tool.py +0 -0
  49. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_config.py +0 -0
  50. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_classes.py +0 -0
  51. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_state.py +0 -0
  52. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flows.py +0 -0
  53. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_graph.py +0 -0
  54. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_kg_cores.py +0 -0
  55. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_documents.py +0 -0
  56. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_processing.py +0 -0
  57. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_mcp_tools.py +0 -0
  58. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_processor_state.py +0 -0
  59. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_prompts.py +0 -0
  60. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_costs.py +0 -0
  61. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_rate.py +0 -0
  62. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/show_tools.py +0 -0
  63. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/start_flow.py +0 -0
  64. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/start_library_processing.py +0 -0
  65. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_flow.py +0 -0
  66. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_library_processing.py +0 -0
  67. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/unload_kg_core.py +0 -0
  68. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/SOURCES.txt +0 -0
  69. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/dependency_links.txt +0 -0
  70. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/entry_points.txt +0 -0
  71. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/requires.txt +0 -0
  72. {trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/top_level.txt +0 -0
{trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.11
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
{trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py

@@ -31,6 +31,7 @@ def load_structured_data(
      suggest_schema: bool = False,
      generate_descriptor: bool = False,
      parse_only: bool = False,
+     auto: bool = False,
      output_file: str = None,
      sample_size: int = 100,
      sample_chars: int = 500,
@@ -49,6 +50,7 @@ def load_structured_data(
          suggest_schema: Analyze data and suggest matching schemas
          generate_descriptor: Generate descriptor from data sample
          parse_only: Parse data but don't import to TrustGraph
+         auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
          output_file: Path to write output (descriptor/parsed data)
          sample_size: Number of records to sample for analysis
          sample_chars: Maximum characters to read for sampling
@@ -62,7 +64,90 @@ def load_structured_data(
      logging.basicConfig(level=logging.INFO)
  
      # Determine operation mode
-     if suggest_schema:
+     if auto:
+         logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
+         logger.info("Step 1: Analyzing data to discover best matching schema...")
+ 
+         # Step 1: Auto-discover schema (reuse suggest_schema logic)
+         discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+         if not discovered_schema:
+             logger.error("Failed to discover suitable schema automatically")
+             print("❌ Could not automatically determine the best schema for your data.")
+             print("💡 Try running with --suggest-schema first to see available options.")
+             return None
+ 
+         logger.info(f"✅ Discovered schema: {discovered_schema}")
+         print(f"🎯 Auto-selected schema: {discovered_schema}")
+ 
+         # Step 2: Auto-generate descriptor
+         logger.info("Step 2: Generating descriptor configuration...")
+         auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+         if not auto_descriptor:
+             logger.error("Failed to generate descriptor automatically")
+             print("❌ Could not automatically generate descriptor configuration.")
+             return None
+ 
+         logger.info("✅ Generated descriptor configuration")
+         print("📝 Generated descriptor configuration")
+ 
+         # Step 3: Parse and preview data
+         logger.info("Step 3: Parsing and validating data...")
+         preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
+         if preview_records is None:
+             logger.error("Failed to parse data with generated descriptor")
+             print("❌ Could not parse data with generated descriptor.")
+             return None
+ 
+         # Show preview
+         print("📊 Data Preview (first few records):")
+         print("=" * 50)
+         for i, record in enumerate(preview_records[:3], 1):
+             print(f"Record {i}: {record}")
+         print("=" * 50)
+ 
+         # Step 4: Import (unless dry_run)
+         if dry_run:
+             logger.info("✅ Dry run complete - data is ready for import")
+             print("✅ Dry run successful! Data is ready for import.")
+             print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+             return None
+         else:
+             logger.info("Step 4: Importing data to TrustGraph...")
+             print("🚀 Importing data to TrustGraph...")
+ 
+             # Recursively call ourselves with the auto-generated descriptor
+             # This reuses all the existing import logic
+             import tempfile
+             import os
+ 
+             # Save auto-generated descriptor to temp file
+             temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+             json.dump(auto_descriptor, temp_descriptor, indent=2)
+             temp_descriptor.close()
+ 
+             try:
+                 # Call the full pipeline mode with our auto-generated descriptor
+                 result = load_structured_data(
+                     api_url=api_url,
+                     input_file=input_file,
+                     descriptor_file=temp_descriptor.name,
+                     flow=flow,
+                     dry_run=False, # We already handled dry_run above
+                     verbose=verbose
+                 )
+ 
+                 print("✅ Auto-import completed successfully!")
+                 logger.info("Auto-import pipeline completed successfully")
+                 return result
+ 
+             finally:
+                 # Clean up temp descriptor file
+                 try:
+                     os.unlink(temp_descriptor.name)
+                 except:
+                     pass
+ 
+     elif suggest_schema:
          logger.info(f"Analyzing {input_file} to suggest schemas...")
          logger.info(f"Sample size: {sample_size} records")
          logger.info(f"Sample chars: {sample_chars} characters")
@@ -497,123 +582,144 @@ def load_structured_data(
          print(f"- Records processed: {len(output_records)}")
          print(f"- Target schema: {schema_name}")
          print(f"- Field mappings: {len(mappings)}")
- 
-     else:
-         # Full pipeline: parse and import
-         if not descriptor_file:
-             # Auto-generate descriptor if not provided
-             logger.info("No descriptor provided, auto-generating...")
-             logger.info(f"Schema name: {schema_name}")
- 
-             # Read sample data for descriptor generation
+ 
+ 
+ # Helper functions for auto mode
+ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
+     """Auto-discover the best matching schema for the input data"""
+     try:
+         # Read sample data
+         with open(input_file, 'r', encoding='utf-8') as f:
+             sample_data = f.read(sample_chars)
+ 
+         # Import API modules
+         from trustgraph.api import Api
+         api = Api(api_url)
+         config_api = api.config()
+ 
+         # Get available schemas
+         schema_keys = config_api.list("schema")
+         if not schema_keys:
+             logger.error("No schemas available in TrustGraph configuration")
+             return None
+ 
+         # Get schema definitions
+         schemas = {}
+         for key in schema_keys:
              try:
-                 with open(input_file, 'r', encoding='utf-8') as f:
-                     sample_data = f.read(sample_chars)
-                 logger.info(f"Read {len(sample_data)} characters for descriptor generation")
+                 schema_def = config_api.get("schema", key)
+                 schemas[key] = schema_def
             except Exception as e:
-                 logger.error(f"Failed to read input file for descriptor generation: {e}")
-                 raise
- 
-             # Generate descriptor using TrustGraph prompt service
-             try:
-                 from trustgraph.api import Api
-                 from trustgraph.api.types import ConfigKey
- 
-                 api = Api(api_url)
-                 config_api = api.config()
- 
-                 # Get available schemas
-                 logger.info("Fetching available schemas for descriptor generation...")
-                 schema_keys = config_api.list("schema")
-                 logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
- 
-                 if not schema_keys:
-                     logger.warning("No schemas found in configuration")
-                     print("No schemas available in TrustGraph configuration")
-                     return
- 
-                 # Fetch each schema definition
-                 schemas = []
-                 config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-                 schema_values = config_api.get(config_keys)
- 
-                 for value in schema_values:
-                     try:
-                         schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                         schemas.append(schema_def)
-                         logger.debug(f"Loaded schema: {value.key}")
-                     except json.JSONDecodeError as e:
-                         logger.warning(f"Failed to parse schema {value.key}: {e}")
-                         continue
- 
-                 logger.info(f"Successfully loaded {len(schemas)} schema definitions")
+                 logger.warning(f"Could not load schema {key}: {e}")
  
-                 # Generate descriptor using diagnose-structured-data prompt
-                 flow_api = api.flow().id(flow)
- 
-                 logger.info("Calling TrustGraph diagnose-structured-data prompt for descriptor generation...")
-                 response = flow_api.prompt(
-                     id="diagnose-structured-data",
-                     variables={
-                         "schemas": schemas,
-                         "sample": sample_data
-                     }
-                 )
- 
-                 # Parse the generated descriptor
-                 if isinstance(response, str):
-                     try:
-                         descriptor = json.loads(response)
-                     except json.JSONDecodeError:
-                         logger.error("Generated descriptor is not valid JSON")
-                         raise ValueError("Failed to generate valid descriptor")
-                 else:
-                     descriptor = response
- 
-                 # Override schema_name if provided
-                 if schema_name:
-                     descriptor.setdefault('output', {})['schema_name'] = schema_name
- 
-                 logger.info("Successfully generated descriptor from data sample")
- 
-             except ImportError as e:
-                 logger.error(f"Failed to import TrustGraph API: {e}")
-                 raise
-             except Exception as e:
-                 logger.error(f"Failed to generate descriptor: {e}")
-                 raise
-         else:
-             # Load existing descriptor
-             try:
-                 with open(descriptor_file, 'r', encoding='utf-8') as f:
-                     descriptor = json.load(f)
-                 logger.info(f"Loaded descriptor configuration from {descriptor_file}")
-             except Exception as e:
-                 logger.error(f"Failed to load descriptor file: {e}")
-                 raise
+         if not schemas:
+             logger.error("No valid schemas could be loaded")
+             return None
+ 
+         # Use prompt service for schema selection
+         flow_api = api.flow().id("default")
+         prompt_client = flow_api.prompt()
  
-         logger.info(f"Processing {input_file} for import...")
+         prompt = f"""Analyze this data sample and determine the best matching schema:
+ 
+ DATA SAMPLE:
+ {sample_data[:1000]}
+ 
+ AVAILABLE SCHEMAS:
+ {json.dumps(schemas, indent=2)}
+ 
+ Return ONLY the schema name (key) that best matches this data. Consider:
+ 1. Field names and types in the data
+ 2. Data structure and format
+ 3. Domain and use case alignment
+ 
+ Schema name:"""
+ 
+         response = prompt_client.schema_selection(
+             schemas=schemas,
+             sample=sample_data[:1000]
+         )
  
-         # Parse data using the same logic as parse-only mode, but with full dataset
-         try:
-             format_info = descriptor.get('format', {})
-             format_type = format_info.get('type', 'csv').lower()
-             encoding = format_info.get('encoding', 'utf-8')
- 
-             logger.info(f"Input format: {format_type}, encoding: {encoding}")
- 
-             with open(input_file, 'r', encoding=encoding) as f:
-                 raw_data = f.read()
- 
-             logger.info(f"Read {len(raw_data)} characters from input file")
+         # Extract schema name from response
+         if isinstance(response, dict) and 'schema' in response:
+             return response['schema']
+         elif isinstance(response, str):
+             # Try to extract schema name from text response
+             response_lower = response.lower().strip()
+             for schema_key in schema_keys:
+                 if schema_key.lower() in response_lower:
+                     return schema_key
+ 
+             # If no exact match, try first mentioned schema
+             words = response.split()
+             for word in words:
+                 clean_word = word.strip('.,!?":').lower()
+                 if clean_word in [s.lower() for s in schema_keys]:
+                     matching_schema = next(s for s in schema_keys if s.lower() == clean_word)
+                     return matching_schema
+ 
+         logger.warning(f"Could not parse schema selection from response: {response}")
+ 
+         # Fallback: return first available schema
+         logger.info(f"Using fallback: first available schema '{schema_keys[0]}'")
+         return schema_keys[0]
+ 
+     except Exception as e:
+         logger.error(f"Schema discovery failed: {e}")
+         return None
+ 
+ 
+ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+     """Auto-generate descriptor configuration for the discovered schema"""
+     try:
+         # Read sample data
+         with open(input_file, 'r', encoding='utf-8') as f:
+             sample_data = f.read(sample_chars)
+ 
+         # Import API modules
+         from trustgraph.api import Api
+         api = Api(api_url)
+         config_api = api.config()
+ 
+         # Get schema definition
+         schema_def = config_api.get("schema", schema_name)
+ 
+         # Use prompt service for descriptor generation
+         flow_api = api.flow().id("default")
+         prompt_client = flow_api.prompt()
+ 
+         response = prompt_client.diagnose_structured_data(
+             sample=sample_data,
+             schema_name=schema_name,
+             schema=schema_def
+         )
+ 
+         if isinstance(response, str):
+             try:
+                 return json.loads(response)
+             except json.JSONDecodeError:
+                 logger.error("Generated descriptor is not valid JSON")
+                 return None
+         else:
+             return response
  
-         except Exception as e:
-             logger.error(f"Failed to read input file: {e}")
-             raise
+     except Exception as e:
+         logger.error(f"Descriptor generation failed: {e}")
+         return None
+ 
+ 
+ def _auto_parse_preview(input_file, descriptor, max_records, logger):
+     """Parse and preview data using the auto-generated descriptor"""
+     try:
+         # Simplified parsing logic for preview (reuse existing logic)
+         format_info = descriptor.get('format', {})
+         format_type = format_info.get('type', 'csv').lower()
+         encoding = format_info.get('encoding', 'utf-8')
+ 
+         with open(input_file, 'r', encoding=encoding) as f:
+             raw_data = f.read()
  
-         # Parse data (reuse parse-only logic but process all records)
          parsed_records = []
-         batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
  
          if format_type == 'csv':
             import csv
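Note that _auto_discover_schema tolerates free-text model output by falling back to string matching against the configured schema keys. That matching logic can be exercised in isolation; a sketch of the same two-stage strategy (extract_schema_name is a hypothetical helper, not part of the package):

    def extract_schema_name(response, schema_keys):
        # Stage 1: substring match against the whole lowered response.
        lowered = response.lower().strip()
        for key in schema_keys:
            if key.lower() in lowered:
                return key
        # Stage 2: punctuation-stripped word-by-word comparison.
        by_lower = {key.lower(): key for key in schema_keys}
        for word in response.split():
            match = by_lower.get(word.strip('.,!?":').lower())
            if match:
                return match
        return None  # the caller then falls back to schema_keys[0]

    assert extract_schema_name('Best match: "customer".', ["customer", "product"]) == "customer"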
@@ -623,234 +729,50 @@ def load_structured_data(
              delimiter = options.get('delimiter', ',')
              has_header = options.get('has_header', True) or options.get('header', True)
  
-             logger.info(f"CSV options - delimiter: '{delimiter}', has_header: {has_header}")
- 
-             try:
-                 reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
-                 if not has_header:
-                     first_row = next(reader)
-                     fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
-                     reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
- 
-                 record_count = 0
-                 for row in reader:
-                     parsed_records.append(row)
-                     record_count += 1
- 
-                     # Process in batches to avoid memory issues
-                     if record_count % batch_size == 0:
-                         logger.info(f"Parsed {record_count} records...")
- 
-             except Exception as e:
-                 logger.error(f"Failed to parse CSV data: {e}")
-                 raise
+             reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
+             if not has_header:
+                 first_row = next(reader)
+                 fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
+                 reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
+ 
+             count = 0
+             for row in reader:
+                 if count >= max_records:
+                     break
+                 parsed_records.append(dict(row))
+                 count += 1
  
          elif format_type == 'json':
-             try:
-                 data = json.loads(raw_data)
-                 if isinstance(data, list):
-                     parsed_records = data
-                 elif isinstance(data, dict):
-                     root_path = format_info.get('options', {}).get('root_path')
-                     if root_path:
-                         if root_path.startswith('$.'):
-                             key = root_path[2:]
-                             data = data.get(key, data)
- 
-                     if isinstance(data, list):
-                         parsed_records = data
-                     else:
-                         parsed_records = [data]
- 
-             except Exception as e:
-                 logger.error(f"Failed to parse JSON data: {e}")
-                 raise
- 
-         elif format_type == 'xml':
-             import xml.etree.ElementTree as ET
+             import json
+             data = json.loads(raw_data)
  
-             options = format_info.get('options', {})
-             record_path = options.get('record_path', '//record')
-             field_attribute = options.get('field_attribute')
- 
-             # Legacy support for old options format
-             if 'root_element' in options or 'record_element' in options:
-                 root_element = options.get('root_element')
-                 record_element = options.get('record_element', 'record')
-                 if root_element:
-                     record_path = f"//{root_element}/{record_element}"
-                 else:
-                     record_path = f"//{record_element}"
- 
-             logger.info(f"XML options - record_path: '{record_path}', field_attribute: '{field_attribute}'")
- 
-             try:
-                 root = ET.fromstring(raw_data)
- 
-                 # Find record elements using XPath
-                 xpath_expr = record_path
-                 if xpath_expr.startswith('/ROOT/'):
-                     xpath_expr = xpath_expr[6:]
-                 elif xpath_expr.startswith('/'):
-                     xpath_expr = '.' + xpath_expr
- 
-                 records = root.findall(xpath_expr)
-                 logger.info(f"Found {len(records)} records using XPath: {record_path} (converted to: {xpath_expr})")
- 
-                 # Convert XML elements to dictionaries
-                 for element in records:
-                     record = {}
- 
-                     if field_attribute:
-                         # Handle field elements with name attributes (UN data format)
-                         for child in element:
-                             if child.tag == 'field' and field_attribute in child.attrib:
-                                 field_name = child.attrib[field_attribute]
-                                 field_value = child.text.strip() if child.text else ""
-                                 record[field_name] = field_value
-                     else:
-                         # Handle standard XML structure
-                         record.update(element.attrib)
- 
-                         for child in element:
-                             if child.text:
-                                 record[child.tag] = child.text.strip()
-                             else:
-                                 record[child.tag] = ""
- 
-                     if not record and element.text:
-                         record['value'] = element.text.strip()
- 
-                     parsed_records.append(record)
- 
-             except ET.ParseError as e:
-                 logger.error(f"Failed to parse XML data: {e}")
-                 raise
-             except Exception as e:
-                 logger.error(f"Failed to process XML data: {e}")
-                 raise
+             if isinstance(data, list):
+                 parsed_records = data[:max_records]
+             else:
+                 parsed_records = [data]
  
-         else:
-             raise ValueError(f"Unsupported format type: {format_type}")
- 
-         logger.info(f"Successfully parsed {len(parsed_records)} records")
- 
-         # Apply transformations and create TrustGraph objects
+         # Apply basic field mappings for preview
          mappings = descriptor.get('mappings', [])
-         processed_records = []
-         schema_name = descriptor.get('output', {}).get('schema_name', 'default')
-         confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
+         preview_records = []
  
-         logger.info(f"Applying {len(mappings)} field mappings...")
- 
-         for record_num, record in enumerate(parsed_records, start=1):
+         for record in parsed_records:
              processed_record = {}
- 
              for mapping in mappings:
-                 source_field = mapping.get('source_field') or mapping.get('source')
-                 target_field = mapping.get('target_field') or mapping.get('target')
+                 source_field = mapping.get('source_field')
+                 target_field = mapping.get('target_field', source_field)
  
                  if source_field in record:
                      value = record[source_field]
- 
-                     # Apply transforms
-                     transforms = mapping.get('transforms', [])
-                     for transform in transforms:
-                         transform_type = transform.get('type')
- 
-                         if transform_type == 'trim' and isinstance(value, str):
-                             value = value.strip()
-                         elif transform_type == 'upper' and isinstance(value, str):
-                             value = value.upper()
-                         elif transform_type == 'lower' and isinstance(value, str):
-                             value = value.lower()
-                         elif transform_type == 'title_case' and isinstance(value, str):
-                             value = value.title()
-                         elif transform_type == 'to_int':
-                             try:
-                                 value = int(value) if value != '' else None
-                             except (ValueError, TypeError):
-                                 logger.warning(f"Failed to convert '{value}' to int in record {record_num}")
-                         elif transform_type == 'to_float':
-                             try:
-                                 value = float(value) if value != '' else None
-                             except (ValueError, TypeError):
-                                 logger.warning(f"Failed to convert '{value}' to float in record {record_num}")
- 
-                     # Convert all values to strings as required by ExtractedObject schema
                      processed_record[target_field] = str(value) if value is not None else ""
-                 else:
-                     logger.warning(f"Source field '{source_field}' not found in record {record_num}")
- 
-             # Create TrustGraph ExtractedObject
-             output_record = {
-                 "metadata": {
-                     "id": f"import-{record_num}",
-                     "metadata": [],
-                     "user": "trustgraph",
-                     "collection": "default"
-                 },
-                 "schema_name": schema_name,
-                 "values": processed_record,
-                 "confidence": confidence,
-                 "source_span": ""
-             }
-             processed_records.append(output_record)
- 
-         logger.info(f"Processed {len(processed_records)} records with transformations")
- 
-         if dry_run:
-             print(f"Dry run mode - would import {len(processed_records)} records to TrustGraph")
-             print(f"Target schema: {schema_name}")
-             print(f"Sample record:")
-             if processed_records:
-                 print(json.dumps(processed_records[0], indent=2))
-             return
+ 
+             if processed_record: # Only add if we got some data
+                 preview_records.append(processed_record)
  
-         # Import to TrustGraph using objects import endpoint via WebSocket
-         logger.info(f"Importing {len(processed_records)} records to TrustGraph...")
+         return preview_records if preview_records else parsed_records
  
-         try:
-             import asyncio
-             from websockets.asyncio.client import connect
- 
-             # Construct objects import URL similar to load_knowledge pattern
-             if not api_url.endswith("/"):
-                 api_url += "/"
- 
-             # Convert HTTP URL to WebSocket URL if needed
-             ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
-             objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
- 
-             logger.info(f"Connecting to objects import endpoint: {objects_url}")
- 
-             async def import_objects():
-                 async with connect(objects_url) as ws:
-                     imported_count = 0
- 
-                     for record in processed_records:
-                         # Send individual ExtractedObject records
-                         await ws.send(json.dumps(record))
-                         imported_count += 1
- 
-                         if imported_count % 100 == 0:
-                             logger.info(f"Imported {imported_count}/{len(processed_records)} records...")
- 
-                     logger.info(f"Successfully imported {imported_count} records to TrustGraph")
-                     return imported_count
- 
-             # Run the async import
-             imported_count = asyncio.run(import_objects())
-             print(f"Import completed: {imported_count} records imported to schema '{schema_name}'")
- 
-         except ImportError as e:
-             logger.error(f"Failed to import required modules: {e}")
-             print(f"Error: Required modules not available - {e}")
-             raise
-         except Exception as e:
-             logger.error(f"Failed to import data to TrustGraph: {e}")
-             print(f"Import failed: {e}")
-             raise
+     except Exception as e:
+         logger.error(f"Preview parsing failed: {e}")
+         return None
  
  
  def main():
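The preview path keeps only the source_field/target_field mapping and the string coercion, dropping the transform pipeline the old import path applied. A sketch of that reduced mapping step (apply_preview_mappings and the sample row are illustrative, assuming mappings entries follow the descriptor format shown above):

    def apply_preview_mappings(record, mappings):
        # Same reduced logic as _auto_parse_preview: copy mapped fields,
        # stringify values, and silently skip fields missing from the record.
        out = {}
        for mapping in mappings:
            source = mapping.get('source_field')
            target = mapping.get('target_field', source)
            if source in record:
                value = record[source]
                out[target] = str(value) if value is not None else ""
        return out

    row = {"Name": "Ada", "Age": 36}
    maps = [{"source_field": "Name", "target_field": "name"},
            {"source_field": "Age"}]
    assert apply_preview_mappings(row, maps) == {"name": "Ada", "Age": "36"}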
@@ -881,26 +803,29 @@ Examples:
      %(prog)s --input customers.csv --descriptor descriptor.json
      %(prog)s --input products.xml --descriptor xml_descriptor.json
  
-     # All-in-one: Auto-generate descriptor and import (for simple cases)
-     %(prog)s --input customers.csv --schema-name customer
+     # FULLY AUTOMATIC: Discover schema + generate descriptor + import (zero manual steps!)
+     %(prog)s --input customers.csv --auto
+     %(prog)s --input products.xml --auto --dry-run # Preview before importing
  
      # Dry run to validate without importing
      %(prog)s --input customers.csv --descriptor descriptor.json --dry-run
  
  Use Cases:
+     --auto : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
+              (zero manual configuration required!)
      --suggest-schema : Diagnose which TrustGraph schemas might match your data
                         (uses --sample-chars to limit data sent for analysis)
      --generate-descriptor: Create/review the structured data language configuration
                         (uses --sample-chars to limit data sent for analysis)
      --parse-only : Validate that parsed data looks correct before import
                     (uses --sample-size to limit records processed, ignores --sample-chars)
-     (no mode flags) : Full pipeline - parse and import to TrustGraph
  
  For more information on the descriptor format, see:
  docs/tech-specs/structured-data-descriptor.md
-     """.strip()
+     """,
      )
  
+     # Required arguments
      parser.add_argument(
          '-u', '--api-url',
          default=default_url,
@@ -941,6 +866,11 @@ For more information on the descriptor format, see:
          action='store_true',
          help='Parse data using descriptor but don\'t import to TrustGraph'
      )
+     mode_group.add_argument(
+         '--auto',
+         action='store_true',
+         help='Run full automatic pipeline: discover schema + generate descriptor + import data'
+     )
  
      parser.add_argument(
          '-o', '--output',
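The hunk above only shows --auto being added to mode_group; the group's construction sits outside the diff, but making the four modes mutually exclusive would come from argparse's add_mutually_exclusive_group. A minimal sketch of that arrangement (flag names and help strings taken from the diff, everything else assumed):

    import argparse

    parser = argparse.ArgumentParser()
    mode_group = parser.add_mutually_exclusive_group()
    for flag, text in [
        ('--suggest-schema', 'Analyze data and suggest schemas'),
        ('--generate-descriptor', 'Generate descriptor from data'),
        ('--parse-only', 'Parse data without importing'),
        ('--auto', 'Discover schema + generate descriptor + import'),
    ]:
        mode_group.add_argument(flag, action='store_true', help=text)

    args = parser.parse_args(['--auto'])
    assert args.auto and not args.parse_only
    # Passing two mode flags together makes argparse exit with a usage error.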
@@ -999,7 +929,12 @@ For more information on the descriptor format, see:
  
      args = parser.parse_args()
  
-     # Validate argument combinations
+     # Input validation
+     if not os.path.exists(args.input):
+         print(f"Error: Input file not found: {args.input}", file=sys.stderr)
+         sys.exit(1)
+ 
+     # Mode-specific validation
      if args.parse_only and not args.descriptor:
          print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
          sys.exit(1)
@@ -1011,11 +946,15 @@ For more information on the descriptor format, see:
      if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100: # 100 is default
          print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)
  
-     if not any([args.suggest_schema, args.generate_descriptor, args.parse_only]) and not args.descriptor:
-         # Full pipeline mode without descriptor - schema_name should be provided
-         if not args.schema_name:
-             print("Error: --descriptor or --schema-name is required for full import", file=sys.stderr)
-             sys.exit(1)
+     # Require explicit mode selection - no implicit behavior
+     if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+         print("Error: Must specify an operation mode", file=sys.stderr)
+         print("Available modes:", file=sys.stderr)
+         print(" --auto : Discover schema + generate descriptor + import", file=sys.stderr)
+         print(" --suggest-schema : Analyze data and suggest schemas", file=sys.stderr)
+         print(" --generate-descriptor : Generate descriptor from data", file=sys.stderr)
+         print(" --parse-only : Parse data without importing", file=sys.stderr)
+         sys.exit(1)
  
      try:
          load_structured_data(
@@ -1025,6 +964,7 @@ For more information on the descriptor format, see:
              suggest_schema=args.suggest_schema,
              generate_descriptor=args.generate_descriptor,
              parse_only=args.parse_only,
+             auto=args.auto,
              output_file=args.output,
              sample_size=args.sample_size,
              sample_chars=args.sample_chars,
trustgraph_cli-1.3.13/trustgraph/cli_version.py (added)

@@ -0,0 +1 @@
+ __version__ = "1.3.13"
{trustgraph_cli-1.3.11 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.11
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
trustgraph_cli-1.3.11/trustgraph/cli_version.py (removed)

@@ -1 +0,0 @@
- __version__ = "1.3.11"