rust-crate-pipeline 1.2.5-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +25 -25
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +309 -200
- rust_crate_pipeline/analysis.py +304 -368
- rust_crate_pipeline/azure_ai_processing.py +453 -0
- rust_crate_pipeline/config.py +57 -19
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +42 -36
- rust_crate_pipeline/main.py +386 -102
- rust_crate_pipeline/network.py +153 -133
- rust_crate_pipeline/pipeline.py +340 -264
- rust_crate_pipeline/production_config.py +35 -32
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +45 -14
- rust_crate_pipeline/utils/logging_utils.py +34 -17
- rust_crate_pipeline/version.py +47 -2
- rust_crate_pipeline-1.3.0.dist-info/METADATA +331 -0
- rust_crate_pipeline-1.3.0.dist-info/RECORD +30 -0
- rust_crate_pipeline-1.2.5.dist-info/METADATA +0 -573
- rust_crate_pipeline-1.2.5.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.2.5.dist-info → rust_crate_pipeline-1.3.0.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/network.py
CHANGED
@@ -1,44 +1,57 @@
 # network.py
 import os
+import sys
 import re
 import time
 import logging
 import requests
-from
-from
-from typing import Dict, List, Optional
+from bs4 import BeautifulSoup, Tag
+from typing import Any, Union
 from .config import PipelineConfig
 
+# Import utilities
+# Add the parent directory to the path to import utils
+sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+
 class GitHubBatchClient:
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
-
+        # Simple headers without dependency on HTTPClientUtils
+        self.headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "SigilDERG-Data-Production/1.0",
+        }
         if config.github_token:
             self.headers["Authorization"] = f"token {config.github_token}"
-
-
-
-
-            )
+
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update(self.headers)
         self.remaining_calls = 5000
         self.reset_time = 0
 
-    def check_rate_limit(self):
+    def check_rate_limit(self) -> None:
         """Check and update current rate limit status"""
         try:
-            response = self.session.get(
+            response = self.session.get(
+                "https://api.github.com/rate_limit", headers=self.headers
+            )
             if response.ok:
                 data = response.json()
                 self.remaining_calls = data["resources"]["core"]["remaining"]
                 self.reset_time = data["resources"]["core"]["reset"]
-
+
                 if self.remaining_calls < 100:
                     reset_in = self.reset_time - time.time()
-                    logging.warning(
+                    logging.warning(
+                        f"GitHub API rate limit low: {self.remaining_calls} remaining. "
+                        f"Resets in {reset_in / 60:.1f} minutes"
+                    )
         except Exception:
             pass
 
-    def get_repo_stats(self, owner: str, repo: str) ->
+    def get_repo_stats(self, owner: str, repo: str) -> dict[str, Any]:
         """Get repository statistics"""
         try:
             url = f"https://api.github.com/repos/{owner}/{repo}"
@@ -46,52 +59,62 @@ class GitHubBatchClient:
             if response.ok:
                 return response.json()
             else:
-                logging.warning(
+                logging.warning(
+                    f"Failed to get repo stats for {owner}/{repo}: "
+                    f"{response.status_code}"
+                )
                 return {}
         except Exception as e:
             logging.error(f"Error fetching repo stats: {str(e)}")
             return {}
 
-    def batch_get_repo_stats(self, repo_list:
+    def batch_get_repo_stats(self, repo_list: list[str]) -> dict[str, dict[str, Any]]:
         """Get statistics for multiple repositories in a batch"""
         self.check_rate_limit()
-
-        results = {}
+
+        results: dict[str, dict[str, Any]] = {}
         for repo_url in repo_list:
             # Extract owner/repo from URL
             match = re.search(r"github\.com/([^/]+)/([^/\.]+)", repo_url)
             if not match:
                 continue
-
+
             owner, repo = match.groups()
-            repo = repo.split(
-
+            repo = repo.split(".")[0]  # Remove .git extension if present
+
             # Get stats
             stats = self.get_repo_stats(owner, repo)
             results[repo_url] = stats
-
+
             # Be nice to GitHub API
             time.sleep(0.1)
-
         return results
 
+
 class CrateAPIClient:
-    def __init__(self, config: PipelineConfig):
+    def __init__(self, config: PipelineConfig) -> None:
         self.config = config
-
-
-
+        # Simple session without dependency on HTTPClientUtils
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": "SigilDERG-Data-Production/1.0"})
+
+    def fetch_crate_metadata(self, crate_name: str) -> dict[str, Any] | None:
         """Fetch metadata with retry logic"""
         for attempt in range(self.config.max_retries):
             try:
                 return self._fetch_metadata(crate_name)
             except Exception as e:
-                logging.warning(
-
+                logging.warning(
+                    f"Attempt {
+                        attempt +
+                        1} failed for {crate_name}: {
+                        str(e)}"
+                )
+                wait = 2**attempt
                 time.sleep(wait)
         return None
 
-    def _fetch_metadata(self, crate_name: str) ->
+    def _fetch_metadata(self, crate_name: str) -> dict[str, Any] | None:
         """Enhanced metadata fetching that tries multiple sources"""
         # First try crates.io (primary source)
         try:
@@ -100,67 +123,92 @@ class CrateAPIClient:
                 data = r.json()
                 crate_data = data["crate"]
                 latest = crate_data["newest_version"]
-
+
                 # Get readme
-                readme_response = self.session.get(
+                readme_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/readme"
+                )
                 readme = readme_response.text if readme_response.ok else ""
-
+
                 # Get dependencies
-
-
-
+                deps_url = (
+                    f"https://crates.io/api/v1/crates/{crate_name}/"
+                    f"{latest}/dependencies"
+                )
+                deps_response = self.session.get(deps_url)
+                deps: list[dict[str, Any]] = (
+                    deps_response.json().get("dependencies", [])
+                    if deps_response.ok
+                    else []
+                )
+
                 # Get features - using the versions endpoint
                 features = []
-                versions_response = self.session.get(
+                versions_response = self.session.get(
+                    f"https://crates.io/api/v1/crates/{crate_name}/{latest}"
+                )
                 if versions_response.ok:
                     version_data = versions_response.json().get("version", {})
                     features_dict = version_data.get("features", {})
-                    features = [
-
+                    features = [
+                        {"name": k, "dependencies": v} for k, v in features_dict.items()
+                    ]
+
                 # Repository info and GitHub stars
                 repo = crate_data.get("repository", "")
                 gh_stars = 0
-
+
                 # Check if it's a GitHub repo
                 if "github.com" in repo and self.config.github_token:
                     match = re.search(r"github.com/([^/]+)/([^/]+)", repo)
                     if match:
                         owner, repo_name = match.groups()
-                        repo_name = repo_name.split(
+                        repo_name = repo_name.split(".")[0]  # Handle .git extensions
                         gh_url = f"https://api.github.com/repos/{owner}/{repo_name}"
-                        gh_headers
+                        gh_headers: dict[str, str] = {}
+                        if self.config.github_token:
+                            gh_headers["Authorization"] = (
+                                f"token {self.config.github_token}"
+                            )
+
                         gh = self.session.get(gh_url, headers=gh_headers)
                         if gh.ok:
                             gh_data = gh.json()
                             gh_stars = gh_data.get("stargazers_count", 0)
-
+
                 # Check if it's hosted on lib.rs
                 lib_rs_data = {}
                 if "lib.rs" in repo:
                     lib_rs_url = f"https://lib.rs/crates/{crate_name}"
                     lib_rs_response = self.session.get(lib_rs_url)
                     if lib_rs_response.ok:
-                        soup = BeautifulSoup(lib_rs_response.text,
+                        soup = BeautifulSoup(lib_rs_response.text, "html.parser")
                         # Get README from lib.rs if not already available
                         if not readme:
-                            readme_div = soup.find(
+                            readme_div = soup.find("div", class_="readme")
                             if readme_div:
-                                readme = readme_div.get_text(
-
-
-                        stats_div = soup.find(
-                        if stats_div:
-                            downloads_text = stats_div.find(
+                                readme = readme_div.get_text(
+                                    strip=True
+                                )  # Get lib.rs specific stats
+                        stats_div = soup.find("div", class_="crate-stats")
+                        if isinstance(stats_div, Tag):
+                            downloads_text = stats_div.find(
+                                string=re.compile(r"[\d,]+ downloads")
+                            )
                             if downloads_text:
-                                lib_rs_data["librs_downloads"] = int(
-
-
-
-
-
-
-
-
+                                lib_rs_data["librs_downloads"] = int(
+                                    re.sub(r"[^\d]", "", str(downloads_text))
+                                )
+
+                # Extract code snippets and sections (simplified)
+                code_snippets: list[str] = (
+                    []
+                )  # Simplified - would normally extract from readme
+                readme_sections: dict[str, str] = (
+                    {}
+                )  # Simplified - would normally parse sections
+
+                result: dict[str, Any] = {
                     "name": crate_name,
                     "version": latest,
                     "description": crate_data.get("description", ""),
@@ -174,42 +222,55 @@ class CrateAPIClient:
                     "code_snippets": code_snippets,
                     "features": features,
                     "readme_sections": readme_sections,
-                    **lib_rs_data
+                    **lib_rs_data,
                 }
-
+
                 return result
-
+
         except Exception as e:
-            logging.error(
+            logging.error(
+                f"Failed fetching metadata for {crate_name}: {
+                    str(e)}"
+            )
             raise
-
+
         # If crates.io fails, try lib.rs
         try:
             r = self.session.get(f"https://lib.rs/crates/{crate_name}")
             if r.ok:
-                soup = BeautifulSoup(r.text,
-
+                soup = BeautifulSoup(r.text, "html.parser")
+
                 # Extract metadata from lib.rs page
-
-
+                h1 = soup.select_one("h1")
+                name = h1.text.strip() if h1 else crate_name
+
                 # Find description
-                desc_elem = soup.select_one(
+                desc_elem = soup.select_one(".description")
                 description = desc_elem.text.strip() if desc_elem else ""
-
+
                 # Find repository link
-                repo_link = None
-                for a in soup.select(
-
-
+                repo_link: Union[str, None] = None
+                for a in soup.select("a"):
+                    href = a.get("href")
+                    if href and isinstance(href, str) and "github.com" in href:
+                        repo_link = href
                         break
-
+
+                # Find keywords
+                keywords_elem = soup.select_one(".keywords")
+                keywords = (
+                    [k.text.strip() for k in keywords_elem.find_all("a")]
+                    if keywords_elem
+                    else []
+                )
+
                 # Basic metadata from lib.rs
                 return {
                     "name": name,
                     "version": "latest",  # lib.rs doesn't easily expose version
                     "description": description,
                     "repository": repo_link or "",
-                    "keywords":
+                    "keywords": keywords,
                     "categories": [],
                     "readme": "",
                     "downloads": 0,
@@ -222,22 +283,26 @@ class CrateAPIClient:
                 }
         except Exception:
             pass
-
+
         # Finally, try GitHub search
         try:
-            # This is a simplification - GitHub's search API requires
-
+            # This is a simplification - GitHub's search API requires
+            # authentication
+            gh_search_headers: dict[str, str] = {}
             if self.config.github_token:
-
-
-            search_url =
-
-
+                gh_search_headers["Authorization"] = f"token {self.config.github_token}"
+
+            search_url = (
+                f"https://api.github.com/search/repositories?"
+                f"q={crate_name}+language:rust"
+            )
+            r = requests.get(search_url, headers=gh_search_headers)
+
             if r.ok:
                 results = r.json().get("items", [])
                 if results:
                     repo = results[0]  # Take first match
-
+
                     # Basic metadata from GitHub
                     return {
                         "name": crate_name,
@@ -257,51 +322,6 @@ class CrateAPIClient:
                     }
         except Exception:
             pass
-
+
         # If all sources fail
         return None
-
-    def extract_code_snippets(self, readme: str) -> List[str]:
-        """Extract code snippets from markdown README"""
-        snippets = []
-        if not readme:
-            return snippets
-
-        # Find Rust code blocks
-        pattern = r"```(?:rust|(?:no_run|ignore|compile_fail|mdbook-runnable)?)\s*([\s\S]*?)```"
-        matches = re.findall(pattern, readme)
-
-        for code in matches:
-            if len(code.strip()) > 10:  # Only include non-trivial snippets
-                snippets.append(code.strip())
-
-        return snippets[:5]  # Limit to 5 snippets
-
-    def extract_readme_sections(self, readme: str) -> Dict[str, str]:
-        """Extract sections from README based on markdown headers"""
-        if not readme:
-            return {}
-
-        sections = {}
-        lines = readme.split('\n')
-        current_section = ""
-        current_content = []
-
-        for line in lines:
-            if re.match(r'^#+\s+', line):  # It's a header
-                # Save previous section
-                if current_section and current_content:
-                    sections[current_section] = '\n'.join(current_content).strip()
-
-                # Start new section
-                current_section = re.sub(r'^#+\s+', '', line).strip()
-                current_content = []
-            else:
-                if current_section:  # Only collect content if we have a section
-                    current_content.append(line)
-
-        # Don't forget the last section
-        if current_section and current_content:
-            sections[current_section] = '\n'.join(current_content).strip()
-
-        return sections