PyPI - rust-crate-pipeline - Versions diffs - 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl - Mend

rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

rust_crate_pipeline/__init__.py +15 -6
rust_crate_pipeline/ai_processing.py +260 -153
rust_crate_pipeline/analysis.py +171 -160
rust_crate_pipeline/config.py +23 -3
rust_crate_pipeline/github_token_checker.py +30 -20
rust_crate_pipeline/main.py +107 -45
rust_crate_pipeline/network.py +109 -108
rust_crate_pipeline/pipeline.py +269 -125
rust_crate_pipeline/production_config.py +15 -9
rust_crate_pipeline/utils/file_utils.py +14 -10
rust_crate_pipeline/utils/logging_utils.py +25 -13
rust_crate_pipeline/version.py +47 -2
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
{rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0

rust_crate_pipeline/analysis.py CHANGED Viewed

@@ -1,19 +1,42 @@
 # analysis.py
 import os
+import sys
 import re
 import io
-import json
 import time
 import tarfile
-import tempfile
 import subprocess
 import requests
-from datetime import datetime
-from dateutil.relativedelta import relativedelta
 from bs4 import BeautifulSoup
-from typing import Dict, Optional, List
+# Import utilities with fallback
+try:
+    # Add the parent directory to the path to import utils
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+    from utils.rust_code_analyzer import RustCodeAnalyzer
+except ImportError:
+    # Fallback implementation for when utils are not available
+    class RustCodeAnalyzer:
+        def __init__(self, code_content):
+            self.code_content = code_content
+        def analyze(self):
+            return {
+                "functions": [],
+                "structs": [],
+                "enums": [],
+                "traits": [],
+                "complexity": 0,
+                "lines_of_code": len(self.code_content.split('\n'))
+            }
+from typing import Dict, List
 from .config import EnrichedCrate
+# Import atomic utilities for code reuse
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 class SourceAnalyzer:
     @staticmethod
     def analyze_crate_source(crate: EnrichedCrate) -> Dict:
@@ -21,18 +44,18 @@ class SourceAnalyzer:
         crate_name = crate.name
         version = crate.version
         repo_url = crate.repository
         # Method 1: Try to download from crates.io
         try:
             url = f"https://crates.io/api/v1/crates/{crate_name}/{version}/download"
             response = requests.get(url, stream=True)
             if response.ok:
                 # We got the tarball, analyze it
                 return SourceAnalyzer.analyze_crate_tarball(response.content)
         except Exception as e:
             print(f"Failed to download from crates.io: {str(e)}")
         # Method 2: Try GitHub if we have a GitHub URL
         if "github.com" in repo_url:
             try:
@@ -40,59 +63,52 @@ class SourceAnalyzer:
                 match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
                 if match:
                     owner, repo_name = match.groups()
-                    repo_name = repo_name.split('.')[0]  # Remove .git extension
+                    repo_name = repo_name.split(
+                        '.')[0]  # Remove .git extension
                     # Try to download tarball from GitHub
                     github_url = f"https://api.github.com/repos/{owner}/{repo_name}/tarball"
                     response = requests.get(github_url)
                     if response.ok:
-                        return SourceAnalyzer.analyze_github_tarball(response.content)
+                        return SourceAnalyzer.analyze_github_tarball(
+                            response.content)
             except Exception as e:
                 print(f"Failed to analyze from GitHub: {str(e)}")
         # Method 3: Try lib.rs
         try:
-            # lib.rs doesn't have a direct download API, but redirects to crates.io or GitHub
+            # lib.rs doesn't have a direct download API, but redirects to crates.io or
+            # GitHub
             url = f"https://lib.rs/crates/{crate_name}"
             response = requests.get(url)
             if response.ok:
                 soup = BeautifulSoup(response.text, 'html.parser')
                 # Look for repository links
                 repo_links = soup.select('a[href*="github.com"]')
                 if repo_links:
                     repo_url = repo_links[0]['href']
                     # We found a GitHub link, now analyze it
-                    return SourceAnalyzer.analyze_crate_source_from_repo(crate_name, version, repo_url)
+                    return SourceAnalyzer.analyze_crate_source_from_repo(
+                        crate_name, version, repo_url)
         except Exception as e:
             print(f"Failed to analyze from lib.rs: {str(e)}")
         # If we get here, we failed to analyze from any source
         return {
             "error": "Could not analyze crate from any source",
             "attempted_sources": ["crates.io", "github", "lib.rs"],
             "file_count": 0,
             "loc": 0
-        }
+        }    @ staticmethod
-    @staticmethod
     def analyze_crate_tarball(content: bytes) -> Dict:
-        """Analyze a .crate tarball from crates.io"""
-        metrics = {
-            "file_count": 0,
-            "loc": 0,
-            "complexity": [],
-            "types": [],
-            "traits": [],
-            "functions": [],
-            "has_tests": False,
-            "has_examples": False,
-            "has_benchmarks": False
-        }
+        """Analyze a .crate tarball from crates.io - refactored to use atomic utilities"""
+        metrics = RustCodeAnalyzer.create_empty_metrics()
         try:
             # Open the tar file from the content
             tar_content = io.BytesIO(content)
@@ -100,14 +116,13 @@ class SourceAnalyzer:
                 # Get list of Rust files
                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
                 metrics["file_count"] = len(rust_files)
-                # Check for test/example/bench directories
+                # Check for test/example/bench directories using atomic utility
                 all_files = tar.getnames()
-                metrics["has_tests"] = any('test' in f.lower() for f in all_files)
-                metrics["has_examples"] = any('example' in f.lower() for f in all_files)
-                metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
-                # Analyze each Rust file
+                structure = RustCodeAnalyzer.detect_project_structure(
+                    all_files)
+                # Analyze each Rust file using atomic utility
                 for filename in rust_files:
                     try:
                         member = tar.getmember(filename)
@@ -115,41 +130,25 @@ class SourceAnalyzer:
                             file_content = tar.extractfile(member)
                             if file_content:
                                 content_str = file_content.read().decode('utf-8', errors='ignore')
-                                # Count lines of code
-                                metrics["loc"] += len(content_str.splitlines())
-                                # Extract code elements
-                                fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
-                                struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
-                                trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
-                                metrics["functions"].extend(fn_matches)
-                                metrics["types"].extend(struct_matches)
-                                metrics["traits"].extend(trait_matches)
+                                # Use atomic content analysis
+                                content_analysis = RustCodeAnalyzer.analyze_rust_content(
+                                    content_str)
+                                metrics = RustCodeAnalyzer.aggregate_metrics(
+                                    metrics, content_analysis, structure)
                     except Exception as e:
                         print(f"Error analyzing file {filename}: {str(e)}")
         except Exception as e:
             metrics["error"] = str(e)
-        return metrics
-    @staticmethod
+        return metrics    @ staticmethod
     def analyze_github_tarball(content: bytes) -> Dict:
-        """Analyze a GitHub tarball (which has a different structure)"""
-        metrics = {
-            "file_count": 0,
-            "loc": 0,
-            "complexity": [],
-            "types": [],
-            "traits": [],
-            "functions": [],
-            "has_tests": False,
-            "has_examples": False,
-            "has_benchmarks": False
-        }
+        """Analyze a GitHub tarball - refactored to use atomic utilities"""
+        metrics = RustCodeAnalyzer.create_empty_metrics()
         try:
             # GitHub tarballs are typically gzipped tar files
             tar_content = io.BytesIO(content)
@@ -158,14 +157,14 @@ class SourceAnalyzer:
                 # So we need to handle the different structure
                 rust_files = [f for f in tar.getnames() if f.endswith('.rs')]
                 metrics["file_count"] = len(rust_files)
-                # Check for test/example/bench directories
+                # Check for test/example/bench directories using atomic utility
                 all_files = tar.getnames()
-                metrics["has_tests"] = any('test' in f.lower() for f in all_files)
-                metrics["has_examples"] = any('example' in f.lower() for f in all_files)
-                metrics["has_benchmarks"] = any('bench' in f.lower() for f in all_files)
-                # Analyze each Rust file (same as crate tarball)
+                structure = RustCodeAnalyzer.detect_project_structure(
+                    all_files)
+                # Analyze each Rust file using atomic utility (same as crate
+                # tarball)
                 for filename in rust_files:
                     try:
                         member = tar.getmember(filename)
@@ -173,97 +172,101 @@ class SourceAnalyzer:
                             file_content = tar.extractfile(member)
                             if file_content:
                                 content_str = file_content.read().decode('utf-8', errors='ignore')
-                                # Count lines of code
-                                metrics["loc"] += len(content_str.splitlines())
-                                # Extract code elements
-                                fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content_str)
-                                struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content_str)
-                                trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content_str)
-                                metrics["functions"].extend(fn_matches)
-                                metrics["types"].extend(struct_matches)
-                                metrics["traits"].extend(trait_matches)
+                                # Use atomic content analysis
+                                content_analysis = RustCodeAnalyzer.analyze_rust_content(
+                                    content_str)
+                                metrics = RustCodeAnalyzer.aggregate_metrics(
+                                    metrics, content_analysis, structure)
                     except Exception as e:
                         print(f"Error analyzing file {filename}: {str(e)}")
         except Exception as e:
             metrics["error"] = str(e)
-        return metrics
-    @staticmethod
+        return metrics    @ staticmethod
     def analyze_local_directory(directory: str) -> Dict:
-        """Analyze source code from a local directory"""
-        metrics = {
-            "file_count": 0,
-            "loc": 0,
-            "complexity": [],
-            "types": [],
-            "traits": [],
-            "functions": [],
-            "has_tests": False,
-            "has_examples": False,
-            "has_benchmarks": False
-        }
+        """Analyze source code from a local directory - refactored to use atomic utilities"""
+        metrics = RustCodeAnalyzer.create_empty_metrics()
         try:
             # Find all Rust files
             rust_files = []
             for root, _, files in os.walk(directory):
                 if "target" in root or ".git" in root:  # Skip build dirs and git
                     continue
-                rust_files.extend([os.path.join(root, f) for f in files if f.endswith(".rs")])
+                rust_files.extend([os.path.join(root, f)
+                                  for f in files if f.endswith(".rs")])
             metrics["file_count"] = len(rust_files)
-            # Check if the crate has tests/examples/benchmarks
-            metrics["has_tests"] = any(os.path.exists(os.path.join(directory, d))
-                                      for d in ["tests", "test"])
-            metrics["has_examples"] = os.path.exists(os.path.join(directory, "examples"))
-            metrics["has_benchmarks"] = os.path.exists(os.path.join(directory, "benches"))
-            # Analyze each Rust file
+            # Check if the crate has tests/examples/benchmarks using atomic
+            # utility
+            project_dirs = [
+                d for d in os.listdir(directory) if os.path.isdir(
+                    os.path.join(
+                        directory, d))]
+            structure = RustCodeAnalyzer.detect_project_structure(
+                project_dirs + ["tests", "examples", "benches"])
+            # Override with actual directory checks
+            structure["has_tests"] = any(
+                os.path.exists(
+                    os.path.join(
+                        directory,
+                        d)) for d in [
+                    "tests",
+                    "test"])
+            structure["has_examples"] = os.path.exists(
+                os.path.join(directory, "examples"))
+            structure["has_benchmarks"] = os.path.exists(
+                os.path.join(directory, "benches"))
+            # Analyze each Rust file using atomic utility
             for file_path in rust_files:
                 try:
                     with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                         content = f.read()
-                    # Count lines of code
-                    metrics["loc"] += len(content.splitlines())
-                    # Extract code elements
-                    fn_matches = re.findall(r'fn\s+([a-zA-Z0-9_]+)', content)
-                    struct_matches = re.findall(r'struct\s+([a-zA-Z0-9_]+)', content)
-                    trait_matches = re.findall(r'trait\s+([a-zA-Z0-9_]+)', content)
-                    metrics["functions"].extend(fn_matches)
-                    metrics["types"].extend(struct_matches)
-                    metrics["traits"].extend(trait_matches)
+                    # Use atomic content analysis
+                    content_analysis = RustCodeAnalyzer.analyze_rust_content(
+                        content)
+                    metrics = RustCodeAnalyzer.aggregate_metrics(
+                        metrics, content_analysis, structure)
                 except Exception as e:
                     print(f"Error analyzing file {file_path}: {str(e)}")
         except Exception as e:
             metrics["error"] = str(e)
         return metrics
     @staticmethod
-    def analyze_crate_source_from_repo(crate_name: str, version: str, repo_url: str) -> Dict:
+    def analyze_crate_source_from_repo(
+            crate_name: str,
+            version: str,
+            repo_url: str) -> Dict:
         """Clone and analyze a crate's source code from repository"""
         temp_dir = f"/tmp/rust_analysis/{crate_name}"
         os.makedirs(temp_dir, exist_ok=True)
         try:
             # Clone repository
             if not os.path.exists(f"{temp_dir}/.git"):
-                subprocess.run(["git", "clone", "--depth=1", repo_url, temp_dir],
-                              capture_output=True, text=True, check=True)
+                subprocess.run(["git",
+                                "clone",
+                                "--depth=1",
+                                repo_url,
+                                temp_dir],
+                               capture_output=True,
+                               text=True,
+                               check=True)
             return SourceAnalyzer.analyze_local_directory(temp_dir)
         except Exception as e:
             return {
                 "error": f"Failed to clone and analyze repository: {str(e)}",
@@ -272,9 +275,10 @@ class SourceAnalyzer:
             }
         finally:
             # Clean up (optional)
-            # subprocess.run(["rm", "-rf", temp_dir], capture_output=True)
+            # subprocess.run(["rm", "-r", temp_dir], capture_output=True)
             pass
 class SecurityAnalyzer:
     @staticmethod
     def check_security_metrics(crate: EnrichedCrate) -> Dict:
@@ -286,10 +290,10 @@ class SecurityAnalyzer:
             "clippy_warnings": 0,
             "test_coverage": None
         }
         crate_name = crate.name
         version = crate.version
         # Check RustSec Advisory Database
         try:
             # This would require the RustSec advisory database
@@ -302,7 +306,7 @@ class SecurityAnalyzer:
                 security_data["vulnerability_count"] = len(advisories)
         except Exception:
             pass
         # Check for common security patterns in code
         try:
             # This would analyze the source code for unsafe blocks, etc.
@@ -311,9 +315,10 @@ class SecurityAnalyzer:
             security_data["security_patterns"] = []
         except Exception:
             pass
         return security_data
 class UserBehaviorAnalyzer:
     @staticmethod
     def fetch_user_behavior_data(crate: EnrichedCrate) -> Dict:
@@ -324,24 +329,25 @@ class UserBehaviorAnalyzer:
             "version_adoption": {},
             "community_metrics": {}
         }
         crate_name = crate.name
         repo_url = crate.repository
         # Extract owner/repo from URL
         if not repo_url or "github.com" not in repo_url:
             return result
         parts = repo_url.rstrip('/').split('/')
         if len(parts) < 2:
             return result
         owner, repo = parts[-2], parts[-1]
         # Setup GitHub API access - use token if available
         headers = {"Accept": "application/vnd.github.v3+json"}
         if os.environ.get("GITHUB_TOKEN"):
-            headers["Authorization"] = f"token {os.environ.get('GITHUB_TOKEN')}"
+            headers["Authorization"] = f"token {
+                os.environ.get('GITHUB_TOKEN')}"
         # Fetch recent issues and PRs
         try:
             # Get issues (last 30)
@@ -349,7 +355,7 @@ class UserBehaviorAnalyzer:
             issues_resp = requests.get(issues_url, headers=headers)
             if issues_resp.ok:
                 issues_data = issues_resp.json()
                 # Process issue data
                 for issue in issues_data:
                     if "pull_request" in issue:
@@ -372,18 +378,18 @@ class UserBehaviorAnalyzer:
                             "closed_at": issue["closed_at"],
                             "url": issue["html_url"]
                         })
             # Fetch commit activity for the past year
             commits_url = f"https://api.github.com/repos/{owner}/{repo}/stats/commit_activity"
             commits_resp = requests.get(commits_url, headers=headers)
             if commits_resp.ok:
                 result["community_metrics"]["commit_activity"] = commits_resp.json()
             # Rate limiting - be nice to GitHub API
             time.sleep(1)
         except Exception as e:
             print(f"Error fetching GitHub data: {str(e)}")
         # Get version adoption data from crates.io
         try:
             versions_url = f"https://crates.io/api/v1/crates/{crate_name}/versions"
@@ -391,36 +397,37 @@ class UserBehaviorAnalyzer:
             if versions_resp.ok:
                 versions_data = versions_resp.json()
                 versions = versions_data.get("versions", [])
                 # Process version data
                 for version in versions[:10]:  # Top 10 versions
                     version_num = version["num"]
                     downloads = version["downloads"]
                     created_at = version["created_at"]
                     result["version_adoption"][version_num] = {
                         "downloads": downloads,
                         "created_at": created_at
                     }
         except Exception as e:
             print(f"Error fetching crates.io version data: {str(e)}")
         return result
 class DependencyAnalyzer:
     @staticmethod
     def analyze_dependencies(crates: List[EnrichedCrate]) -> Dict:
         """Analyze dependencies between crates"""
         dependency_graph = {}
         crate_names = {crate.name for crate in crates}
         for crate in crates:
             deps = []
             for dep in crate.dependencies:
                 if dep.get("crate_id") in crate_names:
                     deps.append(dep.get("crate_id"))
             dependency_graph[crate.name] = deps
         # Find most depended-upon crates
         reverse_deps = {}
         for crate_name, deps in dependency_graph.items():
@@ -428,9 +435,13 @@ class DependencyAnalyzer:
                 if dep not in reverse_deps:
                     reverse_deps[dep] = []
                 reverse_deps[dep].append(crate_name)
         return {
             "dependency_graph": dependency_graph,
             "reverse_dependencies": reverse_deps,
-            "most_depended": sorted(reverse_deps.items(), key=lambda x: len(x[1]), reverse=True)[:10]
-        }
+            "most_depended": sorted(
+                reverse_deps.items(),
+                key=lambda x: len(
+                    x[1]),
+                reverse=True)[
+                :10]}

rust_crate_pipeline/config.py CHANGED Viewed

@@ -1,11 +1,21 @@
 # config.py
 import os
+import warnings
 from dataclasses import dataclass, field
 from typing import Optional, Dict, Any, List
+# Filter Pydantic deprecation warnings from dependencies
+# Rule Zero Compliance: Suppress third-party warnings while maintaining awareness
+warnings.filterwarnings("ignore",
+                       message=".*Support for class-based `config` is deprecated.*",
+                       category=DeprecationWarning,
+                       module="pydantic._internal._config")
 @dataclass
 class PipelineConfig:
-    model_path: str = os.path.expanduser("~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
+    model_path: str = os.path.expanduser(
+        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
     max_tokens: int = 256
     model_token_limit: int = 4096
     prompt_token_margin: int = 3000
@@ -14,7 +24,12 @@ class PipelineConfig:
     github_token: str = os.getenv("GITHUB_TOKEN", "")
     cache_ttl: int = 3600  # 1 hour
     batch_size: int = 10
-    n_workers: int = 4
+    n_workers: int = 4    # Enhanced scraping configuration
+    enable_crawl4ai: bool = True
+    crawl4ai_model: str = os.path.expanduser(
+        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
+    crawl4ai_timeout: int = 30
 @dataclass
 class CrateMetadata:
@@ -33,6 +48,11 @@ class CrateMetadata:
     readme_sections: Dict[str, str] = field(default_factory=dict)
     librs_downloads: Optional[int] = None
     source: str = "crates.io"
+    # Enhanced scraping fields
+    enhanced_scraping: Dict[str, Any] = field(default_factory=dict)
+    enhanced_features: List[str] = field(default_factory=list)
+    enhanced_dependencies: List[str] = field(default_factory=list)
 @dataclass
 class EnrichedCrate(CrateMetadata):
@@ -43,4 +63,4 @@ class EnrichedCrate(CrateMetadata):
     factual_counterfactual: Optional[str] = None
     source_analysis: Optional[Dict[str, Any]] = None
     user_behavior: Optional[Dict[str, Any]] = None
-    security: Optional[Dict[str, Any]] = None
+    security: Optional[Dict[str, Any]] = None

rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl

rust-crate-pipeline 1.2.6__py3-none-any.whl → 1.5.1__py3-none-any.whl