trustgraph-cli 1.3.12.tar.gz → 1.3.13.tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of trustgraph-cli might be problematic.

Files changed (72)
  1. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/PKG-INFO +1 -1
  2. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py +274 -361
  3. trustgraph_cli-1.3.13/trustgraph/cli_version.py +1 -0
  4. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO +1 -1
  5. trustgraph_cli-1.3.12/trustgraph/cli_version.py +0 -1
  6. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/README.md +0 -0
  7. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/pyproject.toml +0 -0
  8. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/setup.cfg +0 -0
  9. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/__init__.py +0 -0
  10. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/add_library_document.py +0 -0
  11. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_config_item.py +0 -0
  12. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_flow_class.py +0 -0
  13. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_kg_core.py +0 -0
  14. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_mcp_tool.py +0 -0
  15. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/delete_tool.py +0 -0
  16. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/dump_msgpack.py +0 -0
  17. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_config_item.py +0 -0
  18. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_flow_class.py +0 -0
  19. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/get_kg_core.py +0 -0
  20. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/graph_to_turtle.py +0 -0
  21. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/init_pulsar_manager.py +0 -0
  22. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/init_trustgraph.py +0 -0
  23. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_agent.py +0 -0
  24. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_document_rag.py +0 -0
  25. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_graph_rag.py +0 -0
  26. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_llm.py +0 -0
  27. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_mcp_tool.py +0 -0
  28. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_nlp_query.py +0 -0
  29. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_objects_query.py +0 -0
  30. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_prompt.py +0 -0
  31. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/invoke_structured_query.py +0 -0
  32. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/list_config_items.py +0 -0
  33. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_doc_embeds.py +0 -0
  34. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_kg_core.py +0 -0
  35. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_knowledge.py +0 -0
  36. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_pdf.py +0 -0
  37. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_sample_documents.py +0 -0
  38. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_text.py +0 -0
  39. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_turtle.py +0 -0
  40. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_config_item.py +0 -0
  41. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_flow_class.py +0 -0
  42. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/put_kg_core.py +0 -0
  43. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/remove_library_document.py +0 -0
  44. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/save_doc_embeds.py +0 -0
  45. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_mcp_tool.py +0 -0
  46. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_prompt.py +0 -0
  47. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_token_costs.py +0 -0
  48. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/set_tool.py +0 -0
  49. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_config.py +0 -0
  50. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_classes.py +0 -0
  51. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flow_state.py +0 -0
  52. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_flows.py +0 -0
  53. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_graph.py +0 -0
  54. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_kg_cores.py +0 -0
  55. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_documents.py +0 -0
  56. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_library_processing.py +0 -0
  57. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_mcp_tools.py +0 -0
  58. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_processor_state.py +0 -0
  59. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_prompts.py +0 -0
  60. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_costs.py +0 -0
  61. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_token_rate.py +0 -0
  62. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/show_tools.py +0 -0
  63. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/start_flow.py +0 -0
  64. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/start_library_processing.py +0 -0
  65. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_flow.py +0 -0
  66. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/stop_library_processing.py +0 -0
  67. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/unload_kg_core.py +0 -0
  68. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/SOURCES.txt +0 -0
  69. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/dependency_links.txt +0 -0
  70. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/entry_points.txt +0 -0
  71. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/requires.txt +0 -0
  72. {trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/top_level.txt +0 -0
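
Most of this release is the rewrite of load_structured_data.py shown below: a new --auto mode chains schema discovery, descriptor generation, a parse preview, and import into a single command, and the CLI now requires an explicit mode flag instead of silently running a full import. A hedged sketch of driving the new mode from Python (the function and parameter names come from the diff below; the API URL is a placeholder, and calling the module this way is illustrative rather than a documented interface):

    from trustgraph.cli.load_structured_data import load_structured_data

    # Preview the automatic pipeline without importing anything (dry run).
    load_structured_data(
        api_url="http://localhost:8088/",  # placeholder; point at your gateway
        input_file="customers.csv",
        auto=True,     # new in 1.3.13: discover schema, build descriptor, import
        dry_run=True,  # stop after the preview instead of importing
    )
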
{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.12
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph

{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph/cli/load_structured_data.py

@@ -31,6 +31,7 @@ def load_structured_data(
  suggest_schema: bool = False,
  generate_descriptor: bool = False,
  parse_only: bool = False,
+ auto: bool = False,
  output_file: str = None,
  sample_size: int = 100,
  sample_chars: int = 500,
@@ -49,6 +50,7 @@ def load_structured_data(
  suggest_schema: Analyze data and suggest matching schemas
  generate_descriptor: Generate descriptor from data sample
  parse_only: Parse data but don't import to TrustGraph
+ auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
  output_file: Path to write output (descriptor/parsed data)
  sample_size: Number of records to sample for analysis
  sample_chars: Maximum characters to read for sampling
@@ -62,7 +64,90 @@ def load_structured_data(
  logging.basicConfig(level=logging.INFO)

  # Determine operation mode
- if suggest_schema:
+ if auto:
+ logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
+ logger.info("Step 1: Analyzing data to discover best matching schema...")
+
+ # Step 1: Auto-discover schema (reuse suggest_schema logic)
+ discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+ if not discovered_schema:
+ logger.error("Failed to discover suitable schema automatically")
+ print("❌ Could not automatically determine the best schema for your data.")
+ print("💡 Try running with --suggest-schema first to see available options.")
+ return None
+
+ logger.info(f"✅ Discovered schema: {discovered_schema}")
+ print(f"🎯 Auto-selected schema: {discovered_schema}")
+
+ # Step 2: Auto-generate descriptor
+ logger.info("Step 2: Generating descriptor configuration...")
+ auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+ if not auto_descriptor:
+ logger.error("Failed to generate descriptor automatically")
+ print("❌ Could not automatically generate descriptor configuration.")
+ return None
+
+ logger.info("✅ Generated descriptor configuration")
+ print("📝 Generated descriptor configuration")
+
+ # Step 3: Parse and preview data
+ logger.info("Step 3: Parsing and validating data...")
+ preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
+ if preview_records is None:
+ logger.error("Failed to parse data with generated descriptor")
+ print("❌ Could not parse data with generated descriptor.")
+ return None
+
+ # Show preview
+ print("📊 Data Preview (first few records):")
+ print("=" * 50)
+ for i, record in enumerate(preview_records[:3], 1):
+ print(f"Record {i}: {record}")
+ print("=" * 50)
+
+ # Step 4: Import (unless dry_run)
+ if dry_run:
+ logger.info("✅ Dry run complete - data is ready for import")
+ print("✅ Dry run successful! Data is ready for import.")
+ print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+ return None
+ else:
+ logger.info("Step 4: Importing data to TrustGraph...")
+ print("🚀 Importing data to TrustGraph...")
+
+ # Recursively call ourselves with the auto-generated descriptor
+ # This reuses all the existing import logic
+ import tempfile
+ import os
+
+ # Save auto-generated descriptor to temp file
+ temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+ json.dump(auto_descriptor, temp_descriptor, indent=2)
+ temp_descriptor.close()
+
+ try:
+ # Call the full pipeline mode with our auto-generated descriptor
+ result = load_structured_data(
+ api_url=api_url,
+ input_file=input_file,
+ descriptor_file=temp_descriptor.name,
+ flow=flow,
+ dry_run=False, # We already handled dry_run above
+ verbose=verbose
+ )
+
+ print("✅ Auto-import completed successfully!")
+ logger.info("Auto-import pipeline completed successfully")
+ return result
+
+ finally:
+ # Clean up temp descriptor file
+ try:
+ os.unlink(temp_descriptor.name)
+ except:
+ pass
+
+ elif suggest_schema:
  logger.info(f"Analyzing {input_file} to suggest schemas...")
  logger.info(f"Sample size: {sample_size} records")
  logger.info(f"Sample chars: {sample_chars} characters")
@@ -497,123 +582,144 @@ def load_structured_data(
  print(f"- Records processed: {len(output_records)}")
  print(f"- Target schema: {schema_name}")
  print(f"- Field mappings: {len(mappings)}")
-
- else:
- # Full pipeline: parse and import
- if not descriptor_file:
- # Auto-generate descriptor if not provided
- logger.info("No descriptor provided, auto-generating...")
- logger.info(f"Schema name: {schema_name}")
-
- # Read sample data for descriptor generation
+
+
+ # Helper functions for auto mode
+ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
+ """Auto-discover the best matching schema for the input data"""
+ try:
+ # Read sample data
+ with open(input_file, 'r', encoding='utf-8') as f:
+ sample_data = f.read(sample_chars)
+
+ # Import API modules
+ from trustgraph.api import Api
+ api = Api(api_url)
+ config_api = api.config()
+
+ # Get available schemas
+ schema_keys = config_api.list("schema")
+ if not schema_keys:
+ logger.error("No schemas available in TrustGraph configuration")
+ return None
+
+ # Get schema definitions
+ schemas = {}
+ for key in schema_keys:
  try:
- with open(input_file, 'r', encoding='utf-8') as f:
- sample_data = f.read(sample_chars)
- logger.info(f"Read {len(sample_data)} characters for descriptor generation")
+ schema_def = config_api.get("schema", key)
+ schemas[key] = schema_def
  except Exception as e:
- logger.error(f"Failed to read input file for descriptor generation: {e}")
- raise
-
- # Generate descriptor using TrustGraph prompt service
- try:
- from trustgraph.api import Api
- from trustgraph.api.types import ConfigKey
-
- api = Api(api_url)
- config_api = api.config()
-
- # Get available schemas
- logger.info("Fetching available schemas for descriptor generation...")
- schema_keys = config_api.list("schema")
- logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
- if not schema_keys:
- logger.warning("No schemas found in configuration")
- print("No schemas available in TrustGraph configuration")
- return
-
- # Fetch each schema definition
- schemas = []
- config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
- schema_values = config_api.get(config_keys)
-
- for value in schema_values:
- try:
- schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
- schemas.append(schema_def)
- logger.debug(f"Loaded schema: {value.key}")
- except json.JSONDecodeError as e:
- logger.warning(f"Failed to parse schema {value.key}: {e}")
- continue
-
- logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
- # Generate descriptor using diagnose-structured-data prompt
- flow_api = api.flow().id(flow)
+ logger.warning(f"Could not load schema {key}: {e}")

- logger.info("Calling TrustGraph diagnose-structured-data prompt for descriptor generation...")
- response = flow_api.prompt(
- id="diagnose-structured-data",
- variables={
- "schemas": schemas,
- "sample": sample_data
- }
- )
-
- # Parse the generated descriptor
- if isinstance(response, str):
- try:
- descriptor = json.loads(response)
- except json.JSONDecodeError:
- logger.error("Generated descriptor is not valid JSON")
- raise ValueError("Failed to generate valid descriptor")
- else:
- descriptor = response
-
- # Override schema_name if provided
- if schema_name:
- descriptor.setdefault('output', {})['schema_name'] = schema_name
-
- logger.info("Successfully generated descriptor from data sample")
-
- except ImportError as e:
- logger.error(f"Failed to import TrustGraph API: {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to generate descriptor: {e}")
- raise
- else:
- # Load existing descriptor
- try:
- with open(descriptor_file, 'r', encoding='utf-8') as f:
- descriptor = json.load(f)
- logger.info(f"Loaded descriptor configuration from {descriptor_file}")
- except Exception as e:
- logger.error(f"Failed to load descriptor file: {e}")
- raise
+ if not schemas:
+ logger.error("No valid schemas could be loaded")
+ return None
+
+ # Use prompt service for schema selection
+ flow_api = api.flow().id("default")
+ prompt_client = flow_api.prompt()

- logger.info(f"Processing {input_file} for import...")
+ prompt = f"""Analyze this data sample and determine the best matching schema:
+
+ DATA SAMPLE:
+ {sample_data[:1000]}
+
+ AVAILABLE SCHEMAS:
+ {json.dumps(schemas, indent=2)}
+
+ Return ONLY the schema name (key) that best matches this data. Consider:
+ 1. Field names and types in the data
+ 2. Data structure and format
+ 3. Domain and use case alignment
+
+ Schema name:"""
+
+ response = prompt_client.schema_selection(
+ schemas=schemas,
+ sample=sample_data[:1000]
+ )

- # Parse data using the same logic as parse-only mode, but with full dataset
- try:
- format_info = descriptor.get('format', {})
- format_type = format_info.get('type', 'csv').lower()
- encoding = format_info.get('encoding', 'utf-8')
-
- logger.info(f"Input format: {format_type}, encoding: {encoding}")
-
- with open(input_file, 'r', encoding=encoding) as f:
- raw_data = f.read()
-
- logger.info(f"Read {len(raw_data)} characters from input file")
+ # Extract schema name from response
+ if isinstance(response, dict) and 'schema' in response:
+ return response['schema']
+ elif isinstance(response, str):
+ # Try to extract schema name from text response
+ response_lower = response.lower().strip()
+ for schema_key in schema_keys:
+ if schema_key.lower() in response_lower:
+ return schema_key
+
+ # If no exact match, try first mentioned schema
+ words = response.split()
+ for word in words:
+ clean_word = word.strip('.,!?":').lower()
+ if clean_word in [s.lower() for s in schema_keys]:
+ matching_schema = next(s for s in schema_keys if s.lower() == clean_word)
+ return matching_schema
+
+ logger.warning(f"Could not parse schema selection from response: {response}")
+
+ # Fallback: return first available schema
+ logger.info(f"Using fallback: first available schema '{schema_keys[0]}'")
+ return schema_keys[0]
+
+ except Exception as e:
+ logger.error(f"Schema discovery failed: {e}")
+ return None
+
+
+ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+ """Auto-generate descriptor configuration for the discovered schema"""
+ try:
+ # Read sample data
+ with open(input_file, 'r', encoding='utf-8') as f:
+ sample_data = f.read(sample_chars)
+
+ # Import API modules
+ from trustgraph.api import Api
+ api = Api(api_url)
+ config_api = api.config()
+
+ # Get schema definition
+ schema_def = config_api.get("schema", schema_name)
+
+ # Use prompt service for descriptor generation
+ flow_api = api.flow().id("default")
+ prompt_client = flow_api.prompt()
+
+ response = prompt_client.diagnose_structured_data(
+ sample=sample_data,
+ schema_name=schema_name,
+ schema=schema_def
+ )
+
+ if isinstance(response, str):
+ try:
+ return json.loads(response)
+ except json.JSONDecodeError:
+ logger.error("Generated descriptor is not valid JSON")
+ return None
+ else:
+ return response

- except Exception as e:
- logger.error(f"Failed to read input file: {e}")
- raise
+ except Exception as e:
+ logger.error(f"Descriptor generation failed: {e}")
+ return None
+
+
+ def _auto_parse_preview(input_file, descriptor, max_records, logger):
+ """Parse and preview data using the auto-generated descriptor"""
+ try:
+ # Simplified parsing logic for preview (reuse existing logic)
+ format_info = descriptor.get('format', {})
+ format_type = format_info.get('type', 'csv').lower()
+ encoding = format_info.get('encoding', 'utf-8')
+
+ with open(input_file, 'r', encoding=encoding) as f:
+ raw_data = f.read()

- # Parse data (reuse parse-only logic but process all records)
  parsed_records = []
- batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)

  if format_type == 'csv':
  import csv
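
The _auto_discover_schema helper added in the hunk above has to tolerate a prompt service that answers in free text rather than with a clean key, so it scans the response for any known schema name before falling back to the first available schema. A distilled, standalone sketch of that matching step (the function name here is mine, not the package's):

    def pick_schema(response_text, schema_keys):
        """Return the first known schema key mentioned in a free-text
        response, mirroring the matching logic in _auto_discover_schema."""
        lowered = response_text.lower()
        # Pass 1: any key that appears verbatim (case-insensitively)
        for key in schema_keys:
            if key.lower() in lowered:
                return key
        # Pass 2: word-by-word scan, stripping trailing punctuation
        by_lower = {k.lower(): k for k in schema_keys}
        for word in response_text.split():
            match = by_lower.get(word.strip('.,!?":').lower())
            if match:
                return match
        return None  # caller falls back to schema_keys[0]
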
@@ -623,261 +729,50 @@ def load_structured_data(
  delimiter = options.get('delimiter', ',')
  has_header = options.get('has_header', True) or options.get('header', True)

- logger.info(f"CSV options - delimiter: '{delimiter}', has_header: {has_header}")
-
- try:
- reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
- if not has_header:
- first_row = next(reader)
- fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
- reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
-
- record_count = 0
- for row in reader:
- parsed_records.append(row)
- record_count += 1
-
- # Process in batches to avoid memory issues
- if record_count % batch_size == 0:
- logger.info(f"Parsed {record_count} records...")
-
- except Exception as e:
- logger.error(f"Failed to parse CSV data: {e}")
- raise
+ reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
+ if not has_header:
+ first_row = next(reader)
+ fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
+ reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
+
+ count = 0
+ for row in reader:
+ if count >= max_records:
+ break
+ parsed_records.append(dict(row))
+ count += 1

  elif format_type == 'json':
- try:
- data = json.loads(raw_data)
- if isinstance(data, list):
- parsed_records = data
- elif isinstance(data, dict):
- root_path = format_info.get('options', {}).get('root_path')
- if root_path:
- if root_path.startswith('$.'):
- key = root_path[2:]
- data = data.get(key, data)
-
- if isinstance(data, list):
- parsed_records = data
- else:
- parsed_records = [data]
-
- except Exception as e:
- logger.error(f"Failed to parse JSON data: {e}")
- raise
-
- elif format_type == 'xml':
- import xml.etree.ElementTree as ET
-
- options = format_info.get('options', {})
- record_path = options.get('record_path', '//record')
- field_attribute = options.get('field_attribute')
+ import json
+ data = json.loads(raw_data)

- # Legacy support for old options format
- if 'root_element' in options or 'record_element' in options:
- root_element = options.get('root_element')
- record_element = options.get('record_element', 'record')
- if root_element:
- record_path = f"//{root_element}/{record_element}"
- else:
- record_path = f"//{record_element}"
-
- logger.info(f"XML options - record_path: '{record_path}', field_attribute: '{field_attribute}'")
-
- try:
- root = ET.fromstring(raw_data)
-
- # Find record elements using XPath
- xpath_expr = record_path
- if xpath_expr.startswith('/ROOT/'):
- xpath_expr = xpath_expr[6:]
- elif xpath_expr.startswith('/'):
- xpath_expr = '.' + xpath_expr
-
- records = root.findall(xpath_expr)
- logger.info(f"Found {len(records)} records using XPath: {record_path} (converted to: {xpath_expr})")
-
- # Convert XML elements to dictionaries
- for element in records:
- record = {}
-
- if field_attribute:
- # Handle field elements with name attributes (UN data format)
- for child in element:
- if child.tag == 'field' and field_attribute in child.attrib:
- field_name = child.attrib[field_attribute]
- field_value = child.text.strip() if child.text else ""
- record[field_name] = field_value
- else:
- # Handle standard XML structure
- record.update(element.attrib)
-
- for child in element:
- if child.text:
- record[child.tag] = child.text.strip()
- else:
- record[child.tag] = ""
-
- if not record and element.text:
- record['value'] = element.text.strip()
-
- parsed_records.append(record)
-
- except ET.ParseError as e:
- logger.error(f"Failed to parse XML data: {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to process XML data: {e}")
- raise
+ if isinstance(data, list):
+ parsed_records = data[:max_records]
+ else:
+ parsed_records = [data]

- else:
- raise ValueError(f"Unsupported format type: {format_type}")
-
- logger.info(f"Successfully parsed {len(parsed_records)} records")
-
- # Apply transformations and create TrustGraph objects
+ # Apply basic field mappings for preview
  mappings = descriptor.get('mappings', [])
- processed_records = []
- schema_name = descriptor.get('output', {}).get('schema_name', 'default')
- confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
-
- logger.info(f"Applying {len(mappings)} field mappings...")
+ preview_records = []

- for record_num, record in enumerate(parsed_records, start=1):
+ for record in parsed_records:
  processed_record = {}
-
  for mapping in mappings:
- source_field = mapping.get('source_field') or mapping.get('source')
- target_field = mapping.get('target_field') or mapping.get('target')
+ source_field = mapping.get('source_field')
+ target_field = mapping.get('target_field', source_field)

  if source_field in record:
  value = record[source_field]
-
- # Apply transforms
- transforms = mapping.get('transforms', [])
- for transform in transforms:
- transform_type = transform.get('type')
-
- if transform_type == 'trim' and isinstance(value, str):
- value = value.strip()
- elif transform_type == 'upper' and isinstance(value, str):
- value = value.upper()
- elif transform_type == 'lower' and isinstance(value, str):
- value = value.lower()
- elif transform_type == 'title_case' and isinstance(value, str):
- value = value.title()
- elif transform_type == 'to_int':
- try:
- value = int(value) if value != '' else None
- except (ValueError, TypeError):
- logger.warning(f"Failed to convert '{value}' to int in record {record_num}")
- elif transform_type == 'to_float':
- try:
- value = float(value) if value != '' else None
- except (ValueError, TypeError):
- logger.warning(f"Failed to convert '{value}' to float in record {record_num}")
-
- # Convert all values to strings as required by ExtractedObject schema
  processed_record[target_field] = str(value) if value is not None else ""
- else:
- logger.warning(f"Source field '{source_field}' not found in record {record_num}")
-
- # Create TrustGraph ExtractedObject
- output_record = {
- "metadata": {
- "id": f"import-{record_num}",
- "metadata": [],
- "user": "trustgraph",
- "collection": "default"
- },
- "schema_name": schema_name,
- "values": processed_record,
- "confidence": confidence,
- "source_span": ""
- }
- processed_records.append(output_record)
-
- logger.info(f"Processed {len(processed_records)} records with transformations")
-
- if dry_run:
- print(f"Dry run mode - would import {len(processed_records)} records to TrustGraph")
- print(f"Target schema: {schema_name}")
- print(f"Sample record:")
- if processed_records:
- # Show what the batched format will look like
- sample_batch = processed_records[:min(3, len(processed_records))]
- batch_values = [record["values"] for record in sample_batch]
- first_record = processed_records[0]
- batched_sample = {
- "metadata": first_record["metadata"],
- "schema_name": first_record["schema_name"],
- "values": batch_values,
- "confidence": first_record["confidence"],
- "source_span": first_record["source_span"]
- }
- print(json.dumps(batched_sample, indent=2))
- return
+
+ if processed_record: # Only add if we got some data
+ preview_records.append(processed_record)

- # Import to TrustGraph using objects import endpoint via WebSocket
- logger.info(f"Importing {len(processed_records)} records to TrustGraph...")
+ return preview_records if preview_records else parsed_records

- try:
- import asyncio
- from websockets.asyncio.client import connect
-
- # Construct objects import URL similar to load_knowledge pattern
- if not api_url.endswith("/"):
- api_url += "/"
-
- # Convert HTTP URL to WebSocket URL if needed
- ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
- objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
-
- logger.info(f"Connecting to objects import endpoint: {objects_url}")
-
- async def import_objects():
- async with connect(objects_url) as ws:
- imported_count = 0
-
- # Process records in batches
- for i in range(0, len(processed_records), batch_size):
- batch_records = processed_records[i:i + batch_size]
-
- # Extract values from each record in the batch
- batch_values = [record["values"] for record in batch_records]
-
- # Create batched ExtractedObject message using first record as template
- first_record = batch_records[0]
- batched_record = {
- "metadata": first_record["metadata"],
- "schema_name": first_record["schema_name"],
- "values": batch_values, # Array of value dictionaries
- "confidence": first_record["confidence"],
- "source_span": first_record["source_span"]
- }
-
- # Send batched ExtractedObject
- await ws.send(json.dumps(batched_record))
- imported_count += len(batch_records)
-
- if imported_count % 100 == 0:
- logger.info(f"Imported {imported_count}/{len(processed_records)} records...")
-
- logger.info(f"Successfully imported {imported_count} records to TrustGraph")
- return imported_count
-
- # Run the async import
- imported_count = asyncio.run(import_objects())
- print(f"Import completed: {imported_count} records imported to schema '{schema_name}'")
-
- except ImportError as e:
- logger.error(f"Failed to import required modules: {e}")
- print(f"Error: Required modules not available - {e}")
- raise
- except Exception as e:
- logger.error(f"Failed to import data to TrustGraph: {e}")
- print(f"Import failed: {e}")
- raise
+ except Exception as e:
+ logger.error(f"Preview parsing failed: {e}")
+ return None


  def main():
@@ -908,26 +803,29 @@ Examples:
  %(prog)s --input customers.csv --descriptor descriptor.json
  %(prog)s --input products.xml --descriptor xml_descriptor.json

- # All-in-one: Auto-generate descriptor and import (for simple cases)
- %(prog)s --input customers.csv --schema-name customer
+ # FULLY AUTOMATIC: Discover schema + generate descriptor + import (zero manual steps!)
+ %(prog)s --input customers.csv --auto
+ %(prog)s --input products.xml --auto --dry-run # Preview before importing

  # Dry run to validate without importing
  %(prog)s --input customers.csv --descriptor descriptor.json --dry-run

  Use Cases:
+ --auto : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
+ (zero manual configuration required!)
  --suggest-schema : Diagnose which TrustGraph schemas might match your data
  (uses --sample-chars to limit data sent for analysis)
  --generate-descriptor: Create/review the structured data language configuration
  (uses --sample-chars to limit data sent for analysis)
  --parse-only : Validate that parsed data looks correct before import
  (uses --sample-size to limit records processed, ignores --sample-chars)
- (no mode flags) : Full pipeline - parse and import to TrustGraph

  For more information on the descriptor format, see:
  docs/tech-specs/structured-data-descriptor.md
- """.strip()
+ """,
  )

+ # Required arguments
  parser.add_argument(
  '-u', '--api-url',
  default=default_url,
@@ -968,6 +866,11 @@ For more information on the descriptor format, see:
  action='store_true',
  help='Parse data using descriptor but don\'t import to TrustGraph'
  )
+ mode_group.add_argument(
+ '--auto',
+ action='store_true',
+ help='Run full automatic pipeline: discover schema + generate descriptor + import data'
+ )

  parser.add_argument(
  '-o', '--output',
@@ -1026,7 +929,12 @@ For more information on the descriptor format, see:

  args = parser.parse_args()

- # Validate argument combinations
+ # Input validation
+ if not os.path.exists(args.input):
+ print(f"Error: Input file not found: {args.input}", file=sys.stderr)
+ sys.exit(1)
+
+ # Mode-specific validation
  if args.parse_only and not args.descriptor:
  print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
  sys.exit(1)
@@ -1038,11 +946,15 @@ For more information on the descriptor format, see:
  if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100: # 100 is default
  print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)

- if not any([args.suggest_schema, args.generate_descriptor, args.parse_only]) and not args.descriptor:
- # Full pipeline mode without descriptor - schema_name should be provided
- if not args.schema_name:
- print("Error: --descriptor or --schema-name is required for full import", file=sys.stderr)
- sys.exit(1)
+ # Require explicit mode selection - no implicit behavior
+ if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+ print("Error: Must specify an operation mode", file=sys.stderr)
+ print("Available modes:", file=sys.stderr)
+ print(" --auto : Discover schema + generate descriptor + import", file=sys.stderr)
+ print(" --suggest-schema : Analyze data and suggest schemas", file=sys.stderr)
+ print(" --generate-descriptor : Generate descriptor from data", file=sys.stderr)
+ print(" --parse-only : Parse data without importing", file=sys.stderr)
+ sys.exit(1)

  try:
  load_structured_data(
@@ -1052,6 +964,7 @@ For more information on the descriptor format, see:
  suggest_schema=args.suggest_schema,
  generate_descriptor=args.generate_descriptor,
  parse_only=args.parse_only,
+ auto=args.auto,
  output_file=args.output,
  sample_size=args.sample_size,
  sample_chars=args.sample_chars,
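
For orientation, the descriptor keys the code above reads (format.type, format.encoding, format.options, mappings entries with source_field/target_field/transforms, and output.schema_name with output.options) imply a document shaped roughly like the sketch below. The key names are grounded in the descriptor.get(...) calls in the diff; the values are invented for illustration:

    descriptor = {
        "format": {
            "type": "csv",              # the diff also handles "json" and "xml"
            "encoding": "utf-8",
            "options": {"delimiter": ",", "has_header": True},
        },
        "mappings": [
            {
                "source_field": "Customer Name",
                "target_field": "name",
                "transforms": [{"type": "trim"}, {"type": "title_case"}],
            },
            {
                "source_field": "Age",
                "target_field": "age",
                "transforms": [{"type": "to_int"}],
            },
        ],
        "output": {
            "schema_name": "customer",
            "options": {"confidence": 0.9, "batch_size": 1000},
        },
    }
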
trustgraph_cli-1.3.13/trustgraph/cli_version.py (new file)

@@ -0,0 +1 @@
+ __version__ = "1.3.13"

{trustgraph_cli-1.3.12 → trustgraph_cli-1.3.13}/trustgraph_cli.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: trustgraph-cli
- Version: 1.3.12
+ Version: 1.3.13
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph

trustgraph_cli-1.3.12/trustgraph/cli_version.py (deleted)

@@ -1 +0,0 @@
- __version__ = "1.3.12"