rust-crate-pipeline 1.3.4__tar.gz → 1.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. rust_crate_pipeline-1.3.6/CHANGELOG_v1.3.5.md +40 -0
  2. rust_crate_pipeline-1.3.6/CHANGELOG_v1.3.6.md +36 -0
  3. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/PKG-INFO +123 -24
  4. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/README.md +110 -9
  5. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/pyproject.toml +11 -13
  6. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements.txt +1 -0
  7. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/analysis.py +95 -53
  8. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/network.py +2 -6
  9. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/pipeline.py +14 -21
  10. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/version.py +3 -1
  11. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/PKG-INFO +123 -24
  12. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/SOURCES.txt +3 -0
  13. rust_crate_pipeline-1.3.6/rust_crate_pipeline.egg-info/not-zip-safe +1 -0
  14. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/requires.txt +3 -0
  15. rust_crate_pipeline-1.3.6/setup.py +81 -0
  16. rust_crate_pipeline-1.3.4/setup.py +0 -77
  17. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/.aider.chat.history.md +0 -0
  18. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.0.txt +0 -0
  19. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.1.md +0 -0
  20. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.2.md +0 -0
  21. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.3.md +0 -0
  22. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.4.md +0 -0
  23. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CRAWL4AI_TYPE_ANALYSIS.md +0 -0
  24. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/LICENSE +0 -0
  25. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/MANIFEST.in +0 -0
  26. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/README_LLM_PROVIDERS.md +0 -0
  27. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements-crawl4ai.txt +0 -0
  28. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements-dev.txt +0 -0
  29. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/__init__.py +0 -0
  30. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/__main__.py +0 -0
  31. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/ai_processing.py +0 -0
  32. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/azure_ai_processing.py +0 -0
  33. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/config.py +0 -0
  34. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/__init__.py +0 -0
  35. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/canon_registry.py +0 -0
  36. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/irl_engine.py +0 -0
  37. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/sacred_chain.py +0 -0
  38. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/crate_analysis.py +0 -0
  39. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/crate_list.txt +0 -0
  40. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/github_token_checker.py +0 -0
  41. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/main.py +0 -0
  42. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/production_config.py +0 -0
  43. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/progress_monitor.py +0 -0
  44. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/scraping/__init__.py +0 -0
  45. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/scraping/unified_scraper.py +0 -0
  46. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/unified_llm_processor.py +0 -0
  47. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/unified_pipeline.py +0 -0
  48. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/utils/file_utils.py +0 -0
  49. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/utils/logging_utils.py +0 -0
  50. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/dependency_links.txt +0 -0
  51. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/entry_points.txt +0 -0
  52. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/top_level.txt +0 -0
  53. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/setup.cfg +0 -0
  54. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_build.py +0 -0
  55. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_config_coverage.py +0 -0
  56. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_basic.py +0 -0
  57. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_demo.py +0 -0
  58. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_integration.py +0 -0
  59. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_integration_fixed.py +0 -0
  60. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_github_token_checker_coverage.py +0 -0
  61. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_logging.py +0 -0
  62. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_main_integration.py +0 -0
  63. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_main_module_coverage.py +0 -0
  64. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_optimization_validation.py +0 -0
  65. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_rule_zero_lookup.py +0 -0
  66. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_rust_analyzer_coverage.py +0 -0
  67. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_sigil_unified.py +0 -0
  68. {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_thread_free.py +0 -0
@@ -0,0 +1,40 @@
1
+ # Changelog for rust-crate-pipeline v1.3.5
2
+
3
+ ## [1.3.5] - 2025-06-27
4
+
5
+ ### Fixed
6
+ - **Enhanced Scraping Integration**: Fixed import errors that prevented enhanced scraping from working
7
+ - Corrected import path from non-existent `enhanced_scraping` module to proper `UnifiedScraper` from `scraping` module
8
+ - Updated method calls to use correct `scrape_crate_documentation()` API
9
+ - Fixed initialization of enhanced scraper in pipeline
10
+ - **Dependency Management**: Added proper Crawl4AI and Playwright support
11
+ - Installed `crawl4ai>=0.6.0` for advanced web scraping capabilities
12
+ - Added `playwright>=1.49.0` and its browser binaries for headless web scraping
13
+ - Added browser installation automation
14
+ - **PEP8 Compliance**: Improved cross-platform compatibility
15
+ - Replaced Unicode symbols with ASCII equivalents in logging messages
16
+ - Enhanced encoding support for better Windows/Linux compatibility
17
+ - Standardized logging format across all modules
18
+
19
+ ### Added
20
+ - **Enhanced Scraping Features**: Full Crawl4AI integration now available
21
+ - Multi-source scraping: crates.io, docs.rs, lib.rs
22
+ - Structured data extraction with quality scoring
23
+ - LLM-powered content analysis when configured
24
+ - Fallback support for basic scraping mode
25
+ - **Improved Error Handling**: Better graceful degradation when enhanced scraping is unavailable
26
+ - **Enhanced Logging**: More informative status messages with consistent formatting
27
+
28
+ ### Technical Improvements
29
+ - **Import Structure**: Cleaner module imports following PEP8 guidelines
30
+ - **Configuration**: Better handling of optional dependencies
31
+ - **Testing**: Enhanced scraping functionality now properly tested and validated
32
+
33
+ ### Dependencies
34
+ - Added: `crawl4ai>=0.6.0`
35
+ - Added: `playwright>=1.49.0`
36
+ - Updated: All existing dependencies to latest compatible versions
37
+
38
+ ---
39
+
40
+ **Note**: This release fully resolves the "Enhanced Scraping not Available" issue and provides a robust web scraping foundation for the pipeline.
@@ -0,0 +1,36 @@
1
+ # Changelog v1.3.6
2
+
3
+ ## [1.3.6] - 2025-01-21
4
+
5
+ ### Changed
6
+ - **BREAKING**: Updated Python version requirement from 3.8+ to 3.12+
7
+ - Updated all type annotations to use modern syntax (`dict[str, Any]` instead of `Dict[str, Any]`)
8
+ - Removed support for Python 3.8, 3.9, 3.10, and 3.11
9
+ - Updated classifiers in pyproject.toml to reflect new Python version support
10
+
11
+ ### Technical Improvements
12
+ - Leveraged Python 3.12+ features for better type safety and performance
13
+ - Simplified type annotations throughout the codebase
14
+ - Improved compatibility with modern Python tooling and linters
15
+ - Enhanced code readability with modern Python syntax
16
+ - Added `from __future__ import annotations` to enable lazy type evaluation
17
+
18
+ ### Documentation
19
+ - Updated README.md to clearly specify Python 3.12+ requirement
20
+ - Added requirements section with detailed system dependencies
21
+ - Updated installation instructions to reflect new version requirements
22
+
23
+ ### Build System
24
+ - Updated pyproject.toml with new Python version constraint
25
+ - Updated setup.py to match pyproject.toml requirements
26
+ - Improved build process compatibility with modern Python versions
27
+
28
+ ### Compatibility
29
+ - This version is **not backward compatible** with Python versions below 3.12
30
+ - Users must upgrade to Python 3.12 or higher to use this version
31
+ - All type annotations now use the built-in generic syntax (e.g. `dict[str, Any]`) introduced in Python 3.9
32
+
33
+ ### Migration Notes
34
+ - If you're currently using Python 3.11 or earlier, you'll need to upgrade to Python 3.12+
35
+ - No code changes are required for existing users; only a Python version upgrade is needed
36
+ - All existing functionality remains the same with improved type safety
@@ -1,40 +1,38 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rust-crate-pipeline
3
- Version: 1.3.4
4
- Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
5
- Home-page: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
6
- Author: SuperUser666-Sigil
7
- Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
8
- License-Expression: MIT
3
+ Version: 1.3.6
4
+ Summary: A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping
5
+ Home-page: https://github.com/SigilDERG/rust-crate-pipeline
6
+ Author: SigilDERG Team
7
+ Author-email: SigilDERG Team <sigilderg@example.com>
8
+ License: MIT
9
9
  Project-URL: Homepage, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
10
10
  Project-URL: Documentation, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production#readme
11
11
  Project-URL: Repository, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
12
12
  Project-URL: Bug Tracker, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/issues
13
- Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
13
+ Keywords: rust,crates,analysis,ai,pipeline,scraping
14
14
  Classifier: Development Status :: 4 - Beta
15
15
  Classifier: Intended Audience :: Developers
16
- Classifier: Operating System :: OS Independent
16
+ Classifier: License :: OSI Approved :: MIT License
17
17
  Classifier: Programming Language :: Python :: 3
18
- Classifier: Programming Language :: Python :: 3.8
19
- Classifier: Programming Language :: Python :: 3.9
20
- Classifier: Programming Language :: Python :: 3.10
21
- Classifier: Programming Language :: Python :: 3.11
22
18
  Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
23
20
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
- Classifier: Topic :: Software Development :: Build Tools
25
21
  Classifier: Topic :: Software Development :: Quality Assurance
26
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
27
- Requires-Python: >=3.8
22
+ Requires-Python: >=3.12
28
23
  Description-Content-Type: text/markdown
29
24
  License-File: LICENSE
30
25
  Requires-Dist: requests>=2.28.0
31
26
  Requires-Dist: requests-cache>=1.0.0
32
27
  Requires-Dist: beautifulsoup4>=4.11.0
28
+ Requires-Dist: crawl4ai>=0.6.0
29
+ Requires-Dist: playwright>=1.49.0
33
30
  Requires-Dist: tqdm>=4.64.0
34
31
  Requires-Dist: llama-cpp-python>=0.2.0
35
32
  Requires-Dist: tiktoken>=0.5.0
36
33
  Requires-Dist: psutil>=5.9.0
37
34
  Requires-Dist: python-dateutil>=2.8.0
35
+ Requires-Dist: litellm>=1.0.0
38
36
  Provides-Extra: dev
39
37
  Requires-Dist: pytest>=7.0.0; extra == "dev"
40
38
  Requires-Dist: black>=22.0.0; extra == "dev"
@@ -57,7 +55,7 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
57
55
 
58
56
  ## Features
59
57
 
60
- - **Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI
58
+ - **Enhanced Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI with Playwright
61
59
  - **AI Enrichment**: Local and Azure OpenAI-powered analysis of crate descriptions, features, and documentation
62
60
  - **Multi-Provider LLM Support**: Unified LLM processor supporting OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
63
61
  - **Cargo Testing**: Automated cargo build, test, and audit execution for comprehensive crate analysis
@@ -66,6 +64,15 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
66
64
  - **Data Export**: Structured output in JSON format for further analysis
67
65
  - **RAG Cache**: Intelligent caching with Rule Zero policies and architectural patterns
68
66
  - **Docker Support**: Containerized deployment with optimized Docker configurations
67
+ - **Real-time Progress Monitoring**: CLI-based progress tracking with ASCII status indicators
68
+ - **Cross-platform Compatibility**: Unicode symbols replaced with ASCII equivalents for better encoding support
69
+
70
+ ## Requirements
71
+
72
+ - **Python 3.12+**: Required for modern type annotations and language features
73
+ - **Git**: For cloning repositories during analysis
74
+ - **Cargo**: For Rust crate testing and analysis
75
+ - **Playwright**: Python package is installed automatically; browser binaries must be installed separately with `playwright install`
69
76
 
70
77
  ## Installation
71
78
 
@@ -74,13 +81,22 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
74
81
  git clone https://github.com/Superuser666-Sigil/SigilDERG-Data_Production.git
75
82
  cd SigilDERG-Data_Production
76
83
 
77
- # Install in development mode
84
+ # Install in development mode (includes all dependencies)
78
85
  pip install -e .
79
86
 
80
- # Install additional dependencies for AI processing
81
- pip install -r requirements-crawl4ai.txt
87
+ # Install Playwright browsers for enhanced scraping
88
+ playwright install
82
89
  ```
83
90
 
91
+ ### Automatic Dependency Installation
92
+
93
+ The package automatically installs all required dependencies including:
94
+ - `crawl4ai` for web scraping
95
+ - `playwright` for enhanced browser automation
96
+ - `requests` for HTTP requests
97
+ - `aiohttp` for async operations
98
+ - And all other required packages
99
+
84
100
  ## Configuration
85
101
 
86
102
  ### Environment Variables
@@ -160,6 +176,27 @@ python -m rust_crate_pipeline --checkpoint-interval 5
160
176
 
161
177
  # Enable verbose logging
162
178
  python -m rust_crate_pipeline --log-level DEBUG
179
+
180
+ # Enable enhanced scraping with Playwright
181
+ python -m rust_crate_pipeline --enable-enhanced-scraping
182
+
183
+ # Set output directory for results
184
+ python -m rust_crate_pipeline --output-path ./results
185
+ ```
186
+
187
+ #### Enhanced Scraping
188
+
189
+ The pipeline now supports enhanced web scraping using Playwright for better data extraction:
190
+
191
+ ```bash
192
+ # Enable enhanced scraping (default)
193
+ python -m rust_crate_pipeline --enable-enhanced-scraping
194
+
195
+ # Use basic scraping only
196
+ python -m rust_crate_pipeline --disable-enhanced-scraping
197
+
198
+ # Configure scraping options
199
+ python -m rust_crate_pipeline --scraping-config '{"max_pages": 10, "concurrency": 3}'
163
200
  ```
164
201
 
165
202
  #### Multi-Provider LLM Support
@@ -280,6 +317,12 @@ clap
280
317
 
281
318
  ## Development
282
319
 
320
+ ### Prerequisites
321
+
322
+ - Python 3.12+ (required for modern type annotations)
323
+ - Git for version control
324
+ - Cargo for Rust crate testing
325
+
283
326
  ### Running Tests
284
327
 
285
328
  ```bash
@@ -291,6 +334,12 @@ pytest tests/test_main_integration.py
291
334
 
292
335
  # Run with coverage
293
336
  pytest --cov=rust_crate_pipeline tests/
337
+
338
+ # Run type checking
339
+ pyright rust_crate_pipeline/
340
+
341
+ # Run linting
342
+ flake8 rust_crate_pipeline/
294
343
  ```
295
344
 
296
345
  ### Code Quality
@@ -304,14 +353,64 @@ isort rust_crate_pipeline/
304
353
 
305
354
  # Type checking
306
355
  pyright rust_crate_pipeline/
356
+
357
+ # Lint code
358
+ flake8 rust_crate_pipeline/
307
359
  ```
308
360
 
309
- ## Requirements
361
+ ### Building and Publishing
362
+
363
+ ```bash
364
+ # Build package
365
+ python -m build
366
+
367
+ # Upload to PyPI (requires PYPI_API_TOKEN)
368
+ python -m twine upload dist/*
369
+
370
+ # Create release
371
+ python scripts/create_release.py
372
+ ```
373
+
374
+ ### Docker Development
375
+
376
+ ```bash
377
+ # Build Docker image
378
+ docker build -t rust-crate-pipeline .
379
+
380
+ # Run in Docker
381
+ docker run -it rust-crate-pipeline
382
+
383
+ # Run with volume mount for development
384
+ docker run -it -v $(pwd):/app rust-crate-pipeline
385
+ ```
310
386
 
311
- - Python 3.12+
312
- - Rust toolchain (for cargo testing)
313
- - Git (for GitHub API access)
314
- - Internet connection (for web scraping and API calls)
387
+ ## Recent Improvements
388
+
389
+ ### Version 1.3.6
390
+ - **Python 3.12+ Requirement**: Updated to use modern type annotations and language features
391
+ - **Type Safety**: Enhanced type annotations throughout the codebase with modern syntax
392
+ - **Build System**: Updated pyproject.toml and setup.py for better compatibility
393
+
394
+ ### Version 1.3.5
395
+ - **Enhanced Web Scraping**: Added Playwright-based scraping for better data extraction
396
+ - **Unicode Compatibility**: Replaced all Unicode symbols with ASCII equivalents for better cross-platform support
397
+ - **Automatic Dependencies**: All required packages are now automatically installed
398
+ - **Real-time Progress**: Added CLI-based progress monitoring with ASCII status indicators
399
+ - **Docker Optimization**: Updated Dockerfile to include Playwright browser installation
400
+
401
+ ### Version 1.3.4
402
+ - **PEP8 Compliance**: Fixed all Unicode emoji and symbols for better encoding support
403
+ - **Cross-platform Compatibility**: Improved compatibility across different operating systems
404
+ - **Type Safety**: Enhanced type annotations throughout the codebase
405
+
406
+ ### Version 1.3.3
407
+ - **Real-time Progress Monitoring**: Added CLI-only progress tracking feature
408
+ - **Enhanced Logging**: Improved status reporting and error handling
409
+
410
+ ### Version 1.3.2
411
+ - **Multi-Provider LLM Support**: Added support for OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
412
+ - **Unified LLM Processor**: Centralized LLM processing with provider abstraction
413
+ - **Enhanced Error Handling**: Better error recovery and retry mechanisms
315
414
 
316
415
  ## License
317
416
 
@@ -8,7 +8,7 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
8
8
 
9
9
  ## Features
10
10
 
11
- - **Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI
11
+ - **Enhanced Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI with Playwright
12
12
  - **AI Enrichment**: Local and Azure OpenAI-powered analysis of crate descriptions, features, and documentation
13
13
  - **Multi-Provider LLM Support**: Unified LLM processor supporting OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
14
14
  - **Cargo Testing**: Automated cargo build, test, and audit execution for comprehensive crate analysis
@@ -17,6 +17,15 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
17
17
  - **Data Export**: Structured output in JSON format for further analysis
18
18
  - **RAG Cache**: Intelligent caching with Rule Zero policies and architectural patterns
19
19
  - **Docker Support**: Containerized deployment with optimized Docker configurations
20
+ - **Real-time Progress Monitoring**: CLI-based progress tracking with ASCII status indicators
21
+ - **Cross-platform Compatibility**: Unicode symbols replaced with ASCII equivalents for better encoding support
22
+
23
+ ## Requirements
24
+
25
+ - **Python 3.12+**: Required for modern type annotations and language features
26
+ - **Git**: For cloning repositories during analysis
27
+ - **Cargo**: For Rust crate testing and analysis
28
+ - **Playwright**: Python package is installed automatically; browser binaries must be installed separately with `playwright install`
20
29
 
21
30
  ## Installation
22
31
 
@@ -25,13 +34,22 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
25
34
  git clone https://github.com/Superuser666-Sigil/SigilDERG-Data_Production.git
26
35
  cd SigilDERG-Data_Production
27
36
 
28
- # Install in development mode
37
+ # Install in development mode (includes all dependencies)
29
38
  pip install -e .
30
39
 
31
- # Install additional dependencies for AI processing
32
- pip install -r requirements-crawl4ai.txt
40
+ # Install Playwright browsers for enhanced scraping
41
+ playwright install
33
42
  ```
34
43
 
44
+ ### Automatic Dependency Installation
45
+
46
+ The package automatically installs all required dependencies including:
47
+ - `crawl4ai` for web scraping
48
+ - `playwright` for enhanced browser automation
49
+ - `requests` for HTTP requests
50
+ - `aiohttp` for async operations
51
+ - And all other required packages
52
+
35
53
  ## Configuration
36
54
 
37
55
  ### Environment Variables
@@ -111,6 +129,27 @@ python -m rust_crate_pipeline --checkpoint-interval 5
111
129
 
112
130
  # Enable verbose logging
113
131
  python -m rust_crate_pipeline --log-level DEBUG
132
+
133
+ # Enable enhanced scraping with Playwright
134
+ python -m rust_crate_pipeline --enable-enhanced-scraping
135
+
136
+ # Set output directory for results
137
+ python -m rust_crate_pipeline --output-path ./results
138
+ ```
139
+
140
+ #### Enhanced Scraping
141
+
142
+ The pipeline now supports enhanced web scraping using Playwright for better data extraction:
143
+
144
+ ```bash
145
+ # Enable enhanced scraping (default)
146
+ python -m rust_crate_pipeline --enable-enhanced-scraping
147
+
148
+ # Use basic scraping only
149
+ python -m rust_crate_pipeline --disable-enhanced-scraping
150
+
151
+ # Configure scraping options
152
+ python -m rust_crate_pipeline --scraping-config '{"max_pages": 10, "concurrency": 3}'
114
153
  ```
115
154
 
116
155
  #### Multi-Provider LLM Support
@@ -231,6 +270,12 @@ clap
231
270
 
232
271
  ## Development
233
272
 
273
+ ### Prerequisites
274
+
275
+ - Python 3.12+ (required for modern type annotations)
276
+ - Git for version control
277
+ - Cargo for Rust crate testing
278
+
234
279
  ### Running Tests
235
280
 
236
281
  ```bash
@@ -242,6 +287,12 @@ pytest tests/test_main_integration.py
242
287
 
243
288
  # Run with coverage
244
289
  pytest --cov=rust_crate_pipeline tests/
290
+
291
+ # Run type checking
292
+ pyright rust_crate_pipeline/
293
+
294
+ # Run linting
295
+ flake8 rust_crate_pipeline/
245
296
  ```
246
297
 
247
298
  ### Code Quality
@@ -255,14 +306,64 @@ isort rust_crate_pipeline/
255
306
 
256
307
  # Type checking
257
308
  pyright rust_crate_pipeline/
309
+
310
+ # Lint code
311
+ flake8 rust_crate_pipeline/
258
312
  ```
259
313
 
260
- ## Requirements
314
+ ### Building and Publishing
315
+
316
+ ```bash
317
+ # Build package
318
+ python -m build
319
+
320
+ # Upload to PyPI (requires PYPI_API_TOKEN)
321
+ python -m twine upload dist/*
322
+
323
+ # Create release
324
+ python scripts/create_release.py
325
+ ```
326
+
327
+ ### Docker Development
328
+
329
+ ```bash
330
+ # Build Docker image
331
+ docker build -t rust-crate-pipeline .
332
+
333
+ # Run in Docker
334
+ docker run -it rust-crate-pipeline
335
+
336
+ # Run with volume mount for development
337
+ docker run -it -v $(pwd):/app rust-crate-pipeline
338
+ ```
261
339
 
262
- - Python 3.12+
263
- - Rust toolchain (for cargo testing)
264
- - Git (for GitHub API access)
265
- - Internet connection (for web scraping and API calls)
340
+ ## Recent Improvements
341
+
342
+ ### Version 1.3.6
343
+ - **Python 3.12+ Requirement**: Updated to use modern type annotations and language features
344
+ - **Type Safety**: Enhanced type annotations throughout the codebase with modern syntax
345
+ - **Build System**: Updated pyproject.toml and setup.py for better compatibility
346
+
347
+ ### Version 1.3.5
348
+ - **Enhanced Web Scraping**: Added Playwright-based scraping for better data extraction
349
+ - **Unicode Compatibility**: Replaced all Unicode symbols with ASCII equivalents for better cross-platform support
350
+ - **Automatic Dependencies**: All required packages are now automatically installed
351
+ - **Real-time Progress**: Added CLI-based progress monitoring with ASCII status indicators
352
+ - **Docker Optimization**: Updated Dockerfile to include Playwright browser installation
353
+
354
+ ### Version 1.3.4
355
+ - **PEP8 Compliance**: Fixed all Unicode emoji and symbols for better encoding support
356
+ - **Cross-platform Compatibility**: Improved compatibility across different operating systems
357
+ - **Type Safety**: Enhanced type annotations throughout the codebase
358
+
359
+ ### Version 1.3.3
360
+ - **Real-time Progress Monitoring**: Added CLI-only progress tracking feature
361
+ - **Enhanced Logging**: Improved status reporting and error handling
362
+
363
+ ### Version 1.3.2
364
+ - **Multi-Provider LLM Support**: Added support for OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
365
+ - **Unified LLM Processor**: Centralized LLM processing with provider abstraction
366
+ - **Enhanced Error Handling**: Better error recovery and retry mechanisms
266
367
 
267
368
  ## License
268
369
 
@@ -4,40 +4,38 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rust-crate-pipeline"
7
- version = "1.3.4"
7
+ version = "1.3.6"
8
8
  authors = [
9
- {name = "SuperUser666-Sigil", email = "miragemodularframework@gmail.com"},
9
+ {name = "SigilDERG Team", email = "sigilderg@example.com"}
10
10
  ]
11
- description = "A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights"
11
+ description = "A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping"
12
12
  readme = "README.md"
13
- license = "MIT"
14
- requires-python = ">=3.8"
13
+ license = {text = "MIT"}
14
+ requires-python = ">=3.12"
15
15
  classifiers = [
16
16
  "Development Status :: 4 - Beta",
17
17
  "Intended Audience :: Developers",
18
- "Operating System :: OS Independent",
18
+ "License :: OSI Approved :: MIT License",
19
19
  "Programming Language :: Python :: 3",
20
- "Programming Language :: Python :: 3.8",
21
- "Programming Language :: Python :: 3.9",
22
- "Programming Language :: Python :: 3.10",
23
- "Programming Language :: Python :: 3.11",
24
20
  "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
25
22
  "Topic :: Software Development :: Libraries :: Python Modules",
26
- "Topic :: Software Development :: Build Tools",
27
23
  "Topic :: Software Development :: Quality Assurance",
28
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
29
24
  ]
30
- keywords = ["rust", "crates", "metadata", "ai", "analysis", "pipeline", "dependencies"]
25
+ keywords = ["rust", "crates", "analysis", "ai", "pipeline", "scraping"]
31
26
 
32
27
  dependencies = [
33
28
  "requests>=2.28.0",
34
29
  "requests-cache>=1.0.0",
35
30
  "beautifulsoup4>=4.11.0",
31
+ "crawl4ai>=0.6.0",
32
+ "playwright>=1.49.0",
36
33
  "tqdm>=4.64.0",
37
34
  "llama-cpp-python>=0.2.0",
38
35
  "tiktoken>=0.5.0",
39
36
  "psutil>=5.9.0",
40
37
  "python-dateutil>=2.8.0",
38
+ "litellm>=1.0.0",
41
39
  ]
42
40
 
43
41
  [project.optional-dependencies]
@@ -4,6 +4,7 @@ requests-cache>=1.0.0
4
4
  beautifulsoup4>=4.11.0
5
5
  # Enhanced web scraping with AI-powered extraction
6
6
  crawl4ai>=0.6.0
7
+ playwright>=1.49.0
7
8
  tqdm>=4.64.0
8
9
  llama-cpp-python>=0.2.0
9
10
  tiktoken>=0.5.0