PyPI - rust-crate-pipeline - Versions diffs - 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl - Mend

rust-crate-pipeline 1.2.6 (py3-none-any.whl) → 1.5.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

rust_crate_pipeline/__init__.py +15 -6
rust_crate_pipeline/ai_processing.py +260 -153
rust_crate_pipeline/analysis.py +171 -160
rust_crate_pipeline/config.py +23 -3
rust_crate_pipeline/github_token_checker.py +30 -20
rust_crate_pipeline/main.py +107 -45
rust_crate_pipeline/network.py +109 -108
rust_crate_pipeline/pipeline.py +269 -125
rust_crate_pipeline/production_config.py +15 -9
rust_crate_pipeline/utils/file_utils.py +14 -10
rust_crate_pipeline/utils/logging_utils.py +25 -13
rust_crate_pipeline/version.py +47 -2
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0

rust_crate_pipeline/github_token_checker.py CHANGED

@@ -9,25 +9,29 @@ import sys
 import requests
 import logging
 def check_github_token_quick():
     """Quick check if GitHub token is available and valid"""
     token = os.getenv("GITHUB_TOKEN")
     if not token:
         return False, "GITHUB_TOKEN environment variable not set"
     if len(token) < 20:
         return False, "GITHUB_TOKEN seems too short - may be invalid"
     try:
         # Quick API check
         headers = {
             "Accept": "application/vnd.github.v3+json",
             "Authorization": f"token {token}"
         }
-        response = requests.get("https://api.github.com/rate_limit", headers=headers, timeout=10)
+        response = requests.get(
+            "https://api.github.com/rate_limit",
+            headers=headers,
+            timeout=10)
         if response.status_code == 200:
             data = response.json()
             remaining = data["resources"]["core"]["remaining"]
@@ -35,18 +39,20 @@ def check_github_token_quick():
         elif response.status_code == 401:
             return False, "GitHub token is invalid or expired"
         else:
-            return False, f"GitHub API returned status code: {response.status_code}"
+            return False, f"GitHub API returned status code: {
+                response.status_code}"
     except requests.exceptions.RequestException as e:
         return False, f"Network error checking token: {str(e)}"
     except Exception as e:
         return False, f"Error checking token: {str(e)}"
 def prompt_for_token_setup():
     """Prompt user to set up GitHub token"""
-    print("\n" + "="*60)
+    print("\n" + "=" * 60)
     print("🔑 GitHub Token Required")
-    print("="*60)
+    print("=" * 60)
     print("\nThe Rust Crate Pipeline requires a GitHub Personal Access Token")
     print("to access repository information and avoid rate limits.")
     print("\n📋 Quick Setup:")
@@ -57,11 +63,12 @@ def prompt_for_token_setup():
     print("\n🔧 Setup Scripts Available:")
     print("   ./setup_github_token.sh    (Interactive setup)")
     print("   python3 check_github_token.py    (Full verification)")
-    print("\n" + "="*60)
+    print("\n" + "=" * 60)
     # Ask if user wants to continue without token (limited functionality)
-    response = input("\nContinue without GitHub token? (y/N): ").strip().lower()
+    response = input(
+        "\nContinue without GitHub token? (y/N): ").strip().lower()
     if response in ['y', 'yes']:
         print("⚠️  Running with limited GitHub API access (60 requests/hour)")
         print("   You may encounter rate limit warnings.")
@@ -70,33 +77,36 @@ def prompt_for_token_setup():
         print("\n🛑 Please set up your GitHub token and try again.")
         return False
 def check_and_setup_github_token():
     """
     Check GitHub token and prompt for setup if missing.
     Returns True if should continue, False if should exit.
     """
     is_valid, message = check_github_token_quick()
     if is_valid:
         logging.debug(f"GitHub token check: {message}")
         return True
     # Token is missing or invalid
     logging.warning(f"GitHub token issue: {message}")
     # Check if we're in a non-interactive environment
     if not sys.stdin.isatty():
-        logging.error("GitHub token not configured and running in non-interactive mode")
+        logging.error(
+            "GitHub token not configured and running in non-interactive mode")
         logging.error("Set GITHUB_TOKEN environment variable before running")
         return False
     # Interactive prompt
     return prompt_for_token_setup()
 if __name__ == "__main__":
     # Allow running this module directly for testing
     is_valid, message = check_github_token_quick()
     print(f"Token check: {'✅' if is_valid else '❌'} {message}")
     if not is_valid:
         check_and_setup_github_token()

rust_crate_pipeline/main.py CHANGED

@@ -1,16 +1,15 @@
 # main.py
-import os
 import sys
 import time
 import logging
 import shutil
 import argparse
-from typing import Optional
 from .config import PipelineConfig
 from .pipeline import CrateDataPipeline
 from .production_config import setup_production_environment
 from .github_token_checker import check_and_setup_github_token
 def parse_arguments():
     """Parse command line arguments"""
     parser = argparse.ArgumentParser(
@@ -26,102 +25,134 @@ Examples:
   PRODUCTION=true python -m rust_crate_pipeline     # Production mode (quieter)
         """
     )
     parser.add_argument(
         '--limit', '-l',
         type=int,
         default=None,
         help='Limit the number of crates to process (default: process all)'
     )
     parser.add_argument(
         '--batch-size', '-b',
         type=int,
         default=10,
         help='Number of crates to process in each batch (default: 10)'
     )
     parser.add_argument(
         '--workers', '-w',
         type=int,
         default=4,
         help='Number of parallel workers for API requests (default: 4)'
     )
     parser.add_argument(
         '--output-dir', '-o',
         type=str,
         default=None,
         help='Output directory for results (default: auto-generated timestamped directory)'
     )
     parser.add_argument(
         '--model-path', '-m',
         type=str,
         default=None,
         help='Path to the LLM model file (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
     )
     parser.add_argument(
         '--max-tokens',
         type=int,
         default=256,
         help='Maximum tokens for LLM generation (default: 256)'
     )
     parser.add_argument(
         '--checkpoint-interval',
         type=int,
         default=10,
         help='Save checkpoint every N crates (default: 10)'
     )
-    parser.add_argument(
-        '--log-level',
-        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
-        default='INFO',
-        help='Logging level (default: INFO)'
-    )
+    parser.add_argument('--log-level',
+                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
+                        default='INFO',
+                        help='Logging level (default: INFO)'
+                        )
     parser.add_argument(
         '--skip-ai',
         action='store_true',
         help='Skip AI enrichment (faster, metadata only)'
     )
     parser.add_argument(
         '--skip-source-analysis',
         action='store_true',
         help='Skip source code analysis'
     )
+    # Enhanced scraping with Crawl4AI
+    parser.add_argument(
+        '--enable-crawl4ai',
+        action='store_true',
+        default=True,
+        help='Enable enhanced web scraping with Crawl4AI (default: enabled)'
+    )
+    parser.add_argument(
+        '--disable-crawl4ai',
+        action='store_true',
+        help='Disable Crawl4AI enhanced scraping (use basic scraping only)'    )
+    parser.add_argument(
+        '--crawl4ai-model',
+        type=str,
+        default='~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
+        help='GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
+    )
+    parser.add_argument(
+        '--enable-sigil-protocol',
+        action='store_true',
+        help='Enable Sigil Protocol Sacred Chain processing (Rule Zero compliance)')
+    parser.add_argument(
+        '--sigil-mode',
+        choices=['enhanced', 'direct-llm', 'hybrid'],
+        default='enhanced',
+        help='Sigil processing mode: enhanced (API-based), direct-llm (local), hybrid (both)'
+    )
     parser.add_argument(
         '--crate-list',
         type=str,
         nargs='+',
         help='Specific crates to process (space-separated list)'
     )
     parser.add_argument(
         '--config-file',
         type=str,
         help='JSON config file to override default settings'
     )
     return parser.parse_args()
 def configure_logging(log_level: str = 'INFO'):
     """Configure logging with both console and file output"""
     level = getattr(logging, log_level.upper())
     # Clear any existing handlers to avoid conflicts
     root_logger = logging.getLogger()
     for handler in root_logger.handlers[:]:
         root_logger.removeHandler(handler)
     # Set root logger level
     root_logger.setLevel(level)
     # Create formatters
     detailed_formatter = logging.Formatter(
         "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
@@ -130,55 +161,58 @@ def configure_logging(log_level: str = 'INFO'):
     simple_formatter = logging.Formatter(
         "%(asctime)s [%(levelname)s] %(message)s"
     )
     # Console handler
     console_handler = logging.StreamHandler()
     console_handler.setLevel(level)
     console_handler.setFormatter(simple_formatter)
     root_logger.addHandler(console_handler)
     # File handler with unique timestamp
     log_filename = f"crate_enrichment_{time.strftime('%Y%m%d-%H%M%S')}.log"
     try:
-        file_handler = logging.FileHandler(log_filename, mode='w', encoding='utf-8')
+        file_handler = logging.FileHandler(
+            log_filename, mode='w', encoding='utf-8')
         file_handler.setLevel(logging.DEBUG)  # Always capture DEBUG+ to file
         file_handler.setFormatter(detailed_formatter)
         root_logger.addHandler(file_handler)
         # Log a test message to verify file handler works
         logging.info(f"Logging initialized - file: {log_filename}")
     except Exception as e:
         logging.error(f"Failed to create log file {log_filename}: {e}")
         print(f"Warning: Could not create log file: {e}")
     # Set library loggers to less verbose levels
     logging.getLogger('requests').setLevel(logging.WARNING)
     logging.getLogger('urllib3').setLevel(logging.WARNING)
     logging.getLogger('requests_cache').setLevel(logging.WARNING)
     logging.getLogger('llama_cpp').setLevel(logging.WARNING)
 def check_disk_space():
     if shutil.disk_usage(".").free < 1_000_000_000:  # 1GB
         logging.warning("Low disk space! This may affect performance.")
 def main():
     # Setup production environment first for optimal logging
     prod_config = setup_production_environment()
     args = parse_arguments()
     configure_logging(args.log_level)
     check_disk_space()
     # Check GitHub token before proceeding
     if not check_and_setup_github_token():
         logging.error("GitHub token setup cancelled or failed. Exiting.")
         sys.exit(1)
     try:
         # Create config from command line arguments
         config_kwargs = {}
         # Apply production optimizations if available
         if prod_config:
             config_kwargs.update({
@@ -187,7 +221,7 @@ def main():
                 'checkpoint_interval': prod_config.get('checkpoint_interval', 10),
                 'cache_ttl': prod_config.get('cache_ttl', 3600),
             })
         if args.batch_size:
             config_kwargs['batch_size'] = args.batch_size
         if args.workers:
@@ -198,16 +232,23 @@ def main():
             config_kwargs['max_tokens'] = args.max_tokens
         if args.checkpoint_interval:
             config_kwargs['checkpoint_interval'] = args.checkpoint_interval
-        # Load config file if provided
+            # Load config file if provided
         if args.config_file:
             import json
             with open(args.config_file, 'r') as f:
                 file_config = json.load(f)
                 config_kwargs.update(file_config)
+        # Handle Crawl4AI configuration
+        enable_crawl4ai = args.enable_crawl4ai and not args.disable_crawl4ai if hasattr(
+            args, 'disable_crawl4ai') else True
+        config_kwargs.update({
+            'enable_crawl4ai': enable_crawl4ai,
+            'crawl4ai_model': getattr(args, 'crawl4ai_model', '~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf')
+        })
         config = PipelineConfig(**config_kwargs)
         # Pass additional arguments to pipeline
         pipeline_kwargs = {}
         if args.output_dir:
@@ -220,15 +261,36 @@ def main():
             pipeline_kwargs['skip_ai'] = True
         if args.skip_source_analysis:
             pipeline_kwargs['skip_source'] = True
-        pipeline = CrateDataPipeline(config, **pipeline_kwargs)
+        # Sigil Protocol integration
+        if hasattr(
+                args,
+                'enable_sigil_protocol') and args.enable_sigil_protocol:
+            # Import Sigil enhanced pipeline
+            try:
+                import sys
+                sys.path.append('.')  # Add current directory to path
+                from sigil_enhanced_pipeline import SigilCompliantPipeline
+                pipeline = SigilCompliantPipeline(config, **pipeline_kwargs)
+                logging.info(
+                    "Starting Sigil Protocol compliant pipeline with Sacred Chain processing")
+            except ImportError as e:
+                logging.warning(f"Sigil enhanced pipeline not available: {e}")
+                logging.info("Falling back to standard pipeline")
+                pipeline = CrateDataPipeline(config, **pipeline_kwargs)
+        else:
+            pipeline = CrateDataPipeline(config, **pipeline_kwargs)
         logging.info(f"Starting pipeline with {len(vars(args))} arguments")
-        pipeline.run()
+        # Run the pipeline asynchronously
+        import asyncio
+        asyncio.run(pipeline.run())
     except Exception as e:
         logging.critical(f"Pipeline failed: {str(e)}")
         sys.exit(1)
 if __name__ == "__main__":
-    main()
+    main()

rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl

rust-crate-pipeline 1.2.6 (py3-none-any.whl) → 1.5.1 (py3-none-any.whl)