rust_crate_pipeline-1.2.6-py3-none-any.whl → rust_crate_pipeline-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +15 -6
- rust_crate_pipeline/ai_processing.py +260 -153
- rust_crate_pipeline/analysis.py +171 -160
- rust_crate_pipeline/config.py +23 -3
- rust_crate_pipeline/github_token_checker.py +30 -20
- rust_crate_pipeline/main.py +107 -45
- rust_crate_pipeline/network.py +109 -108
- rust_crate_pipeline/pipeline.py +269 -125
- rust_crate_pipeline/production_config.py +15 -9
- rust_crate_pipeline/utils/file_utils.py +14 -10
- rust_crate_pipeline/utils/logging_utils.py +25 -13
- rust_crate_pipeline/version.py +47 -2
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +94 -9
- rust_crate_pipeline-1.5.1.dist-info/RECORD +19 -0
- rust_crate_pipeline-1.2.6.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.6.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/network.py
CHANGED
```diff
@@ -1,40 +1,68 @@
 # network.py
 import os
+import sys
 import re
 import time
 import logging
 import requests
-from requests_cache import CachedSession
 from bs4 import BeautifulSoup
 from typing import Dict, List, Optional
 from .config import PipelineConfig
 
+# Import utilities with fallback
+try:
+    # Add the parent directory to the path to import utils
+    sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+    from utils.http_client_utils import HTTPClientUtils, MetadataExtractor
+except ImportError:
+    # Fallback implementations for when utils are not available
+    class HTTPClientUtils:
+        def __init__(self):
+            pass
+
+    class MetadataExtractor:
+        def __init__(self):
+            pass
+
+    # Import atomic utilities for code reuse
+    import sys
+    sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+
+
 class GitHubBatchClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-
+        # Simple headers without dependency on HTTPClientUtils
+        self.headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"
-
-
-
-
-            )
+
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0
 
     def check_rate_limit(self):
         """Check and update current rate limit status"""
         try:
-            response = self.session.get(
+            response = self.session.get(
+                "https://api.github.com/rate_limit",
+                headers=self.headers)
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]
-
+
                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
-                    logging.warning(
+                    logging.warning(
+                        f"GitHub API rate limit low: {
+                            self.remaining_calls} remaining. Resets in {
+                            reset_in / 60:.1f} minutes")
         except Exception:
             pass
 
```
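The rewritten `check_rate_limit` reduces to a single GET against GitHub's `/rate_limit` endpoint. A minimal standalone sketch of the same check (the function name and session setup here are illustrative; the 100-call threshold and the `resources.core` fields mirror the diff):

```python
import time
import logging
import requests

def check_github_rate_limit(session: requests.Session) -> None:
    """Warn when the GitHub core-API quota is nearly exhausted."""
    resp = session.get("https://api.github.com/rate_limit")
    if resp.ok:
        core = resp.json()["resources"]["core"]
        remaining, reset_at = core["remaining"], core["reset"]
        if remaining < 100:  # same threshold the pipeline uses
            minutes = (reset_at - time.time()) / 60
            logging.warning(
                "GitHub rate limit low: %d calls left, resets in %.1f min",
                remaining, minutes)

# Works unauthenticated too, just against the lower anonymous quota.
check_github_rate_limit(requests.Session())
```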
```diff
@@ -46,7 +74,8 @@ class GitHubBatchClient:
             if response.ok:
                 return response.json()
             else:
-                logging.warning(
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: {response.status_code}")
                 return {}
         except Exception as e:
             logging.error(f"Error fetching repo stats: {str(e)}")
@@ -55,38 +84,46 @@ class GitHubBatchClient:
     def batch_get_repo_stats(self, repo_list: List[str]) -> Dict[str, Dict]:
         """Get statistics for multiple repositories in a batch"""
         self.check_rate_limit()
-
+
         results = {}
         for repo_url in repo_list:
             # Extract owner/repo from URL
             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
             if not match:
                 continue
-
+
             owner, repo = match.groups()
             repo = repo.split('.')[0]  # Remove .git extension if present
-
+
             # Get stats
             stats = self.get_repo_stats(owner, repo)
             results[repo_url] = stats
-
+
             # Be nice to GitHub API
             time.sleep(0.1)
-
         return results
 
+
 class CrateAPIClient:
     def __init__(self, config: PipelineConfig):
         self.config = config
-
-
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": "SigilDERG-Data-Production/1.0"
+        })
+
     def fetch_crate_metadata(self, crate_name: str) -> Optional[Dict]:
         """Fetch metadata with retry logic"""
         for attempt in range(self.config.max_retries):
             try:
                 return self._fetch_metadata(crate_name)
             except Exception as e:
-                logging.warning(
+                logging.warning(
+                    f"Attempt {
+                        attempt +
+                        1} failed for {crate_name}: {
+                        str(e)}")
                 wait = 2 ** attempt
                 time.sleep(wait)
         return None
```
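`fetch_crate_metadata` keeps its retry loop: `_fetch_metadata` is attempted `max_retries` times with an exponential backoff of `2 ** attempt` seconds. The same pattern in isolation (a sketch; `fetch_with_retry` is a hypothetical name, not part of the package):

```python
import time
import logging

def fetch_with_retry(fetch, name: str, max_retries: int = 3):
    """Call fetch(name), retrying with exponential backoff like fetch_crate_metadata."""
    for attempt in range(max_retries):
        try:
            return fetch(name)
        except Exception as e:
            logging.warning("Attempt %d failed for %s: %s", attempt + 1, name, e)
            time.sleep(2 ** attempt)  # waits 1s, 2s, 4s, ...
    return None  # all attempts failed
```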
```diff
@@ -95,71 +132,77 @@ class CrateAPIClient:
         """Enhanced metadata fetching that tries multiple sources"""
         # First try crates.io (primary source)
         try:
-            r = self.session.get(
+            r = self.session.get(
+                f"https://crates.io/api/v1/crates/{crate_name}")
             if r.ok:
                 data = r.json()
                 crate_data = data["crate"]
                 latest = crate_data["newest_version"]
-
+
                 # Get readme
-                readme_response = self.session.get(
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme")
                 readme = readme_response.text if readme_response.ok else ""
-
+
                 # Get dependencies
-                deps_response = self.session.get(
+                deps_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
                 deps = deps_response.json().get("dependencies", []) if deps_response.ok else []
-
+
                 # Get features - using the versions endpoint
                 features = []
-                versions_response = self.session.get(
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}")
                 if versions_response.ok:
                     version_data = versions_response.json().get("version", {})
                     features_dict = version_data.get("features", {})
-                    features = [{"name": k, "dependencies": v}
-
+                    features = [{"name": k, "dependencies": v}
+                                for k, v in features_dict.items()]
+
                 # Repository info and GitHub stars
                 repo = crate_data.get("repository", "")
                 gh_stars = 0
-
+
                 # Check if it's a GitHub repo
                 if "github.com" in repo and self.config.github_token:
                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
                     if match:
                         owner, repo_name = match.groups()
-                        repo_name = repo_name.split(
+                        repo_name = repo_name.split(
+                            '.')[0]  # Handle .git extensions
                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                        gh_headers = {
+                        gh_headers = {
+                            "Authorization": f"token {
+                                self.config.github_token}"} if self.config.github_token else {}
                         gh = self.session.get(gh_url, headers=gh_headers)
                         if gh.ok:
                             gh_data = gh.json()
                             gh_stars = gh_data.get("stargazers_count", 0)
-
+
                 # Check if it's hosted on lib.rs
                 lib_rs_data = {}
                 if "lib.rs" in repo:
                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
                     lib_rs_response = self.session.get(lib_rs_url)
                     if lib_rs_response.ok:
-                        soup = BeautifulSoup(
+                        soup = BeautifulSoup(
+                            lib_rs_response.text, 'html.parser')
                         # Get README from lib.rs if not already available
                         if not readme:
                             readme_div = soup.find('div', class_='readme')
                             if readme_div:
-                                readme = readme_div.get_text(strip=True)
-
-                        # Get lib.rs specific stats
+                                readme = readme_div.get_text(strip=True)  # Get lib.rs specific stats
                         stats_div = soup.find('div', class_='crate-stats')
                         if stats_div:
-                            downloads_text = stats_div.find(
-
-                                lib_rs_data["librs_downloads"] = int(
-
-
-
-
-                            #
-
-
+                            downloads_text = stats_div.find(
+                                string=re.compile(r'[\d,]+ downloads'))
+                            if downloads_text: lib_rs_data["librs_downloads"] = int(
+                                re.sub(r'[^\d]', '', str(downloads_text)))
+
+                # Extract code snippets and sections (simplified)
+                code_snippets = []  # Simplified - would normally extract from readme
+                readme_sections = {}  # Simplified - would normally parse sections
+
                 result = {
                     "name": crate_name,
                     "version": latest,
```
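The crates.io branch above chains three public API endpoints: the crate record, its README, and the dependency list for the newest version. A condensed sketch of that sequence, assuming a plain `requests` session with a placeholder User-Agent (error handling and the GitHub/lib.rs fallbacks omitted):

```python
import requests

def crates_io_summary(crate_name: str) -> dict:
    """Fetch crate record, README, and dependencies from crates.io, as the diff does."""
    s = requests.Session()
    s.headers.update({"User-Agent": "example-client/0.1"})  # placeholder UA, not the package's
    crate = s.get(f"https://crates.io/api/v1/crates/{crate_name}").json()["crate"]
    latest = crate["newest_version"]
    readme = s.get(f"https://crates.io/api/v1/crates/{crate_name}/readme").text
    deps_resp = s.get(f"https://crates.io/api/v1/crates/{crate_name}/{latest}/dependencies")
    deps = deps_resp.json().get("dependencies", []) if deps_resp.ok else []
    return {"name": crate_name, "version": latest, "readme": readme, "dependencies": deps}

print(crates_io_summary("serde")["version"])
```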
```diff
@@ -176,33 +219,36 @@ class CrateAPIClient:
                     "readme_sections": readme_sections,
                     **lib_rs_data
                 }
-
+
                 return result
-
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {
+                    str(e)}")
             raise
-
+
         # If crates.io fails, try lib.rs
         try:
             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
             if r.ok:
                 soup = BeautifulSoup(r.text, 'html.parser')
-
+
                 # Extract metadata from lib.rs page
-                name = soup.select_one('h1').text.strip(
-
+                name = soup.select_one('h1').text.strip(
+                ) if soup.select_one('h1') else crate_name
+
                 # Find description
                 desc_elem = soup.select_one('.description')
                 description = desc_elem.text.strip() if desc_elem else ""
-
+
                 # Find repository link
                 repo_link = None
                 for a in soup.select('a'):
-                    if 'github.com' in a.get('
+                    if 'github.com' in a.get('href', ''):
                         repo_link = a['href']
                         break
-
+
                 # Basic metadata from lib.rs
                 return {
                     "name": name,
@@ -222,22 +268,23 @@ class CrateAPIClient:
                 }
         except Exception:
             pass
-
+
         # Finally, try GitHub search
         try:
-            # This is a simplification - GitHub's search API requires
+            # This is a simplification - GitHub's search API requires
+            # authentication
             headers = {}
             if self.config.github_token:
                 headers["Authorization"] = f"token {self.config.github_token}"
-
+
             search_url = f"https://api.github.com/search/repositories?q={crate_name}+language:rust"
             r = requests.get(search_url, headers=headers)
-
+
             if r.ok:
                 results = r.json().get("items", [])
                 if results:
                     repo = results[0]  # Take first match
-
+
                     # Basic metadata from GitHub
                     return {
                         "name": crate_name,
@@ -249,8 +296,7 @@ class CrateAPIClient:
                         "readme": "",
                         "downloads": 0,
                         "github_stars": repo.get("stargazers_count", 0),
-                        "dependencies": [],
-                        "code_snippets": [],
+                        "dependencies": [], "code_snippets": [],
                         "features": [],
                         "readme_sections": {},
                         "source": "github",
@@ -260,48 +306,3 @@ class CrateAPIClient:
 
         # If all sources fail
         return None
-
-    def extract_code_snippets(self, readme: str) -> List[str]:
-        """Extract code snippets from markdown README"""
-        snippets = []
-        if not readme:
-            return snippets
-
-        # Find Rust code blocks
-        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
-        matches = re.findall(pattern, readme)
-
-        for code in matches:
-            if len(code.strip()) > 10:  # Only include non-trivial snippets
-                snippets.append(code.strip())
-
-        return snippets[:5]  # Limit to 5 snippets
-
-    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
-        """Extract sections from README based on markdown headers"""
-        if not readme:
-            return {}
-
-        sections = {}
-        lines = readme.split('\n')
-        current_section = ""
-        current_content = []
-
-        for line in lines:
-            if re.match(r'^#+\s+', line):  # It's a header
-                # Save previous section
-                if current_section and current_content:
-                    sections[current_section] = '\n'.join(current_content).strip()
-
-                # Start new section
-                current_section = re.sub(r'^#+\s+', '', line).strip()
-                current_content = []
-            else:
-                if current_section:  # Only collect content if we have a section
-                    current_content.append(line)
-
-        # Don't forget the last section
-        if current_section and current_content:
-            sections[current_section] = '\n'.join(current_content).strip()
-
-        return sections
```