rust-crate-pipeline 1.3.4__tar.gz → 1.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline-1.3.6/CHANGELOG_v1.3.5.md +40 -0
- rust_crate_pipeline-1.3.6/CHANGELOG_v1.3.6.md +36 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/PKG-INFO +123 -24
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/README.md +110 -9
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/pyproject.toml +11 -13
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements.txt +1 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/analysis.py +95 -53
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/network.py +2 -6
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/pipeline.py +14 -21
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/version.py +3 -1
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/PKG-INFO +123 -24
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/SOURCES.txt +3 -0
- rust_crate_pipeline-1.3.6/rust_crate_pipeline.egg-info/not-zip-safe +1 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/requires.txt +3 -0
- rust_crate_pipeline-1.3.6/setup.py +81 -0
- rust_crate_pipeline-1.3.4/setup.py +0 -77
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/.aider.chat.history.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.0.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.1.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.2.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.3.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CHANGELOG_v1.3.4.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/CRAWL4AI_TYPE_ANALYSIS.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/LICENSE +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/MANIFEST.in +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/README_LLM_PROVIDERS.md +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements-crawl4ai.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/requirements-dev.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/__init__.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/__main__.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/ai_processing.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/azure_ai_processing.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/config.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/__init__.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/canon_registry.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/irl_engine.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/core/sacred_chain.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/crate_analysis.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/crate_list.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/github_token_checker.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/main.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/production_config.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/progress_monitor.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/scraping/__init__.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/scraping/unified_scraper.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/unified_llm_processor.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/unified_pipeline.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/utils/file_utils.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline/utils/logging_utils.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/dependency_links.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/entry_points.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/rust_crate_pipeline.egg-info/top_level.txt +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/setup.cfg +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_build.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_config_coverage.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_basic.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_demo.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_integration.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_crawl4ai_integration_fixed.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_github_token_checker_coverage.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_logging.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_main_integration.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_main_module_coverage.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_optimization_validation.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_rule_zero_lookup.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_rust_analyzer_coverage.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_sigil_unified.py +0 -0
- {rust_crate_pipeline-1.3.4 → rust_crate_pipeline-1.3.6}/tests/test_thread_free.py +0 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
# Changelog for rust-crate-pipeline v1.3.5
|
2
|
+
|
3
|
+
## [1.3.5] - 2025-06-27
|
4
|
+
|
5
|
+
### Fixed
|
6
|
+
- **Enhanced Scraping Integration**: Fixed import errors that prevented enhanced scraping from working
|
7
|
+
- Corrected import path from non-existent `enhanced_scraping` module to proper `UnifiedScraper` from `scraping` module
|
8
|
+
- Updated method calls to use correct `scrape_crate_documentation()` API
|
9
|
+
- Fixed initialization of enhanced scraper in pipeline
|
10
|
+
- **Dependency Management**: Added proper Crawl4AI and Playwright support
|
11
|
+
- Added `crawl4ai>=0.6.0` as a dependency for advanced web scraping capabilities
|
12
|
+
- Added `playwright>=1.49.0` and its browser binaries for headless web scraping
|
13
|
+
- Added browser installation automation
|
14
|
+
- **PEP8 Compliance**: Improved cross-platform compatibility
|
15
|
+
- Replaced Unicode symbols with ASCII equivalents in logging messages
|
16
|
+
- Enhanced encoding support for better Windows/Linux compatibility
|
17
|
+
- Standardized logging format across all modules
|
18
|
+
|
19
|
+
### Added
|
20
|
+
- **Enhanced Scraping Features**: Full Crawl4AI integration now available
|
21
|
+
- Multi-source scraping: crates.io, docs.rs, lib.rs
|
22
|
+
- Structured data extraction with quality scoring
|
23
|
+
- LLM-powered content analysis when configured
|
24
|
+
- Fallback support for basic scraping mode
|
25
|
+
- **Improved Error Handling**: Better graceful degradation when enhanced scraping is unavailable
|
26
|
+
- **Enhanced Logging**: More informative status messages with consistent formatting
|
27
|
+
|
28
|
+
### Technical Improvements
|
29
|
+
- **Import Structure**: Cleaner module imports following PEP8 guidelines
|
30
|
+
- **Configuration**: Better handling of optional dependencies
|
31
|
+
- **Testing**: Enhanced scraping functionality now properly tested and validated
|
32
|
+
|
33
|
+
### Dependencies
|
34
|
+
- Added: `crawl4ai>=0.6.0`
|
35
|
+
- Added: `playwright>=1.49.0`
|
36
|
+
- Updated: All existing dependencies to latest compatible versions
|
37
|
+
|
38
|
+
---
|
39
|
+
|
40
|
+
**Note**: This release fully resolves the "Enhanced Scraping not Available" issue and provides a robust web scraping foundation for the pipeline.
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Changelog v1.3.6
|
2
|
+
|
3
|
+
## [1.3.6] - 2025-01-21
|
4
|
+
|
5
|
+
### Changed
|
6
|
+
- **BREAKING**: Updated Python version requirement from 3.8+ to 3.12+
|
7
|
+
- Updated all type annotations to use modern syntax (dict[str, Any] instead of Dict[str, Any])
|
8
|
+
- Removed support for Python 3.8, 3.9, 3.10, and 3.11
|
9
|
+
- Updated classifiers in pyproject.toml to reflect new Python version support
|
10
|
+
|
11
|
+
### Technical Improvements
|
12
|
+
- Leveraged Python 3.12+ features for better type safety and performance
|
13
|
+
- Simplified type annotations throughout the codebase
|
14
|
+
- Improved compatibility with modern Python tooling and linters
|
15
|
+
- Enhanced code readability with modern Python syntax
|
16
|
+
- Added `from __future__ import annotations` to enable lazy type evaluation
|
17
|
+
|
18
|
+
### Documentation
|
19
|
+
- Updated README.md to clearly specify Python 3.12+ requirement
|
20
|
+
- Added requirements section with detailed system dependencies
|
21
|
+
- Updated installation instructions to reflect new version requirements
|
22
|
+
|
23
|
+
### Build System
|
24
|
+
- Updated pyproject.toml with new Python version constraint
|
25
|
+
- Updated setup.py to match pyproject.toml requirements
|
26
|
+
- Improved build process compatibility with modern Python versions
|
27
|
+
|
28
|
+
### Compatibility
|
29
|
+
- This version is **not backward compatible** with Python versions below 3.12
|
30
|
+
- Users must upgrade to Python 3.12 or higher to use this version
|
31
|
+
- All type annotations now use the built-in generic syntax (e.g. `dict[str, Any]`) that has been available since Python 3.9
|
32
|
+
|
33
|
+
### Migration Notes
|
34
|
+
- If you're currently using Python 3.11 or earlier, you'll need to upgrade to Python 3.12+
|
35
|
+
- No code changes are required for existing users, only a Python version upgrade
|
36
|
+
- All existing functionality remains the same with improved type safety
|
@@ -1,40 +1,38 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: rust-crate-pipeline
|
3
|
-
Version: 1.3.4
|
4
|
-
Summary: A comprehensive
|
5
|
-
Home-page: https://github.com/
|
6
|
-
Author:
|
7
|
-
Author-email:
|
8
|
-
License
|
3
|
+
Version: 1.3.6
|
4
|
+
Summary: A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping
|
5
|
+
Home-page: https://github.com/SigilDERG/rust-crate-pipeline
|
6
|
+
Author: SigilDERG Team
|
7
|
+
Author-email: SigilDERG Team <sigilderg@example.com>
|
8
|
+
License: MIT
|
9
9
|
Project-URL: Homepage, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
|
10
10
|
Project-URL: Documentation, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production#readme
|
11
11
|
Project-URL: Repository, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
|
12
12
|
Project-URL: Bug Tracker, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/issues
|
13
|
-
Keywords: rust,crates,
|
13
|
+
Keywords: rust,crates,analysis,ai,pipeline,scraping
|
14
14
|
Classifier: Development Status :: 4 - Beta
|
15
15
|
Classifier: Intended Audience :: Developers
|
16
|
-
Classifier:
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
17
17
|
Classifier: Programming Language :: Python :: 3
|
18
|
-
Classifier: Programming Language :: Python :: 3.8
|
19
|
-
Classifier: Programming Language :: Python :: 3.9
|
20
|
-
Classifier: Programming Language :: Python :: 3.10
|
21
|
-
Classifier: Programming Language :: Python :: 3.11
|
22
18
|
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
23
20
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
24
|
-
Classifier: Topic :: Software Development :: Build Tools
|
25
21
|
Classifier: Topic :: Software Development :: Quality Assurance
|
26
|
-
|
27
|
-
Requires-Python: >=3.8
|
22
|
+
Requires-Python: >=3.12
|
28
23
|
Description-Content-Type: text/markdown
|
29
24
|
License-File: LICENSE
|
30
25
|
Requires-Dist: requests>=2.28.0
|
31
26
|
Requires-Dist: requests-cache>=1.0.0
|
32
27
|
Requires-Dist: beautifulsoup4>=4.11.0
|
28
|
+
Requires-Dist: crawl4ai>=0.6.0
|
29
|
+
Requires-Dist: playwright>=1.49.0
|
33
30
|
Requires-Dist: tqdm>=4.64.0
|
34
31
|
Requires-Dist: llama-cpp-python>=0.2.0
|
35
32
|
Requires-Dist: tiktoken>=0.5.0
|
36
33
|
Requires-Dist: psutil>=5.9.0
|
37
34
|
Requires-Dist: python-dateutil>=2.8.0
|
35
|
+
Requires-Dist: litellm>=1.0.0
|
38
36
|
Provides-Extra: dev
|
39
37
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
40
38
|
Requires-Dist: black>=22.0.0; extra == "dev"
|
@@ -57,7 +55,7 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
57
55
|
|
58
56
|
## Features
|
59
57
|
|
60
|
-
- **Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI
|
58
|
+
- **Enhanced Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI with Playwright
|
61
59
|
- **AI Enrichment**: Local and Azure OpenAI-powered analysis of crate descriptions, features, and documentation
|
62
60
|
- **Multi-Provider LLM Support**: Unified LLM processor supporting OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
63
61
|
- **Cargo Testing**: Automated cargo build, test, and audit execution for comprehensive crate analysis
|
@@ -66,6 +64,15 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
66
64
|
- **Data Export**: Structured output in JSON format for further analysis
|
67
65
|
- **RAG Cache**: Intelligent caching with Rule Zero policies and architectural patterns
|
68
66
|
- **Docker Support**: Containerized deployment with optimized Docker configurations
|
67
|
+
- **Real-time Progress Monitoring**: CLI-based progress tracking with ASCII status indicators
|
68
|
+
- **Cross-platform Compatibility**: Unicode symbols replaced with ASCII equivalents for better encoding support
|
69
|
+
|
70
|
+
## Requirements
|
71
|
+
|
72
|
+
- **Python 3.12+**: Required for modern type annotations and language features
|
73
|
+
- **Git**: For cloning repositories during analysis
|
74
|
+
- **Cargo**: For Rust crate testing and analysis
|
75
|
+
- **Playwright**: Automatically installed for enhanced web scraping
|
69
76
|
|
70
77
|
## Installation
|
71
78
|
|
@@ -74,13 +81,22 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
74
81
|
git clone https://github.com/Superuser666-Sigil/SigilDERG-Data_Production.git
|
75
82
|
cd SigilDERG-Data_Production
|
76
83
|
|
77
|
-
# Install in development mode
|
84
|
+
# Install in development mode (includes all dependencies)
|
78
85
|
pip install -e .
|
79
86
|
|
80
|
-
# Install
|
81
|
-
|
87
|
+
# Install Playwright browsers for enhanced scraping
|
88
|
+
playwright install
|
82
89
|
```
|
83
90
|
|
91
|
+
### Automatic Dependency Installation
|
92
|
+
|
93
|
+
The package automatically installs all required dependencies including:
|
94
|
+
- `crawl4ai` for web scraping
|
95
|
+
- `playwright` for enhanced browser automation
|
96
|
+
- `requests` for HTTP requests
|
97
|
+
- `aiohttp` for async operations
|
98
|
+
- And all other required packages
|
99
|
+
|
84
100
|
## Configuration
|
85
101
|
|
86
102
|
### Environment Variables
|
@@ -160,6 +176,27 @@ python -m rust_crate_pipeline --checkpoint-interval 5
|
|
160
176
|
|
161
177
|
# Enable verbose logging
|
162
178
|
python -m rust_crate_pipeline --log-level DEBUG
|
179
|
+
|
180
|
+
# Enable enhanced scraping with Playwright
|
181
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
182
|
+
|
183
|
+
# Set output directory for results
|
184
|
+
python -m rust_crate_pipeline --output-path ./results
|
185
|
+
```
|
186
|
+
|
187
|
+
#### Enhanced Scraping
|
188
|
+
|
189
|
+
The pipeline now supports enhanced web scraping using Playwright for better data extraction:
|
190
|
+
|
191
|
+
```bash
|
192
|
+
# Enable enhanced scraping (default)
|
193
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
194
|
+
|
195
|
+
# Use basic scraping only
|
196
|
+
python -m rust_crate_pipeline --disable-enhanced-scraping
|
197
|
+
|
198
|
+
# Configure scraping options
|
199
|
+
python -m rust_crate_pipeline --scraping-config '{"max_pages": 10, "concurrency": 3}'
|
163
200
|
```
|
164
201
|
|
165
202
|
#### Multi-Provider LLM Support
|
@@ -280,6 +317,12 @@ clap
|
|
280
317
|
|
281
318
|
## Development
|
282
319
|
|
320
|
+
### Prerequisites
|
321
|
+
|
322
|
+
- Python 3.12+ (required for modern type annotations)
|
323
|
+
- Git for version control
|
324
|
+
- Cargo for Rust crate testing
|
325
|
+
|
283
326
|
### Running Tests
|
284
327
|
|
285
328
|
```bash
|
@@ -291,6 +334,12 @@ pytest tests/test_main_integration.py
|
|
291
334
|
|
292
335
|
# Run with coverage
|
293
336
|
pytest --cov=rust_crate_pipeline tests/
|
337
|
+
|
338
|
+
# Run type checking
|
339
|
+
pyright rust_crate_pipeline/
|
340
|
+
|
341
|
+
# Run linting
|
342
|
+
flake8 rust_crate_pipeline/
|
294
343
|
```
|
295
344
|
|
296
345
|
### Code Quality
|
@@ -304,14 +353,64 @@ isort rust_crate_pipeline/
|
|
304
353
|
|
305
354
|
# Type checking
|
306
355
|
pyright rust_crate_pipeline/
|
356
|
+
|
357
|
+
# Lint code
|
358
|
+
flake8 rust_crate_pipeline/
|
307
359
|
```
|
308
360
|
|
309
|
-
|
361
|
+
### Building and Publishing
|
362
|
+
|
363
|
+
```bash
|
364
|
+
# Build package
|
365
|
+
python -m build
|
366
|
+
|
367
|
+
# Upload to PyPI (requires PYPI_API_TOKEN)
|
368
|
+
python -m twine upload dist/*
|
369
|
+
|
370
|
+
# Create release
|
371
|
+
python scripts/create_release.py
|
372
|
+
```
|
373
|
+
|
374
|
+
### Docker Development
|
375
|
+
|
376
|
+
```bash
|
377
|
+
# Build Docker image
|
378
|
+
docker build -t rust-crate-pipeline .
|
379
|
+
|
380
|
+
# Run in Docker
|
381
|
+
docker run -it rust-crate-pipeline
|
382
|
+
|
383
|
+
# Run with volume mount for development
|
384
|
+
docker run -it -v $(pwd):/app rust-crate-pipeline
|
385
|
+
```
|
310
386
|
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
-
|
387
|
+
## Recent Improvements
|
388
|
+
|
389
|
+
### Version 1.3.6
|
390
|
+
- **Python 3.12+ Requirement**: Updated to use modern type annotations and language features
|
391
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase with modern syntax
|
392
|
+
- **Build System**: Updated pyproject.toml and setup.py for better compatibility
|
393
|
+
|
394
|
+
### Version 1.3.5
|
395
|
+
- **Enhanced Web Scraping**: Added Playwright-based scraping for better data extraction
|
396
|
+
- **Unicode Compatibility**: Replaced all Unicode symbols with ASCII equivalents for better cross-platform support
|
397
|
+
- **Automatic Dependencies**: All required packages are now automatically installed
|
398
|
+
- **Real-time Progress**: Added CLI-based progress monitoring with ASCII status indicators
|
399
|
+
- **Docker Optimization**: Updated Dockerfile to include Playwright browser installation
|
400
|
+
|
401
|
+
### Version 1.3.4
|
402
|
+
- **PEP8 Compliance**: Replaced all Unicode emoji and symbols with ASCII equivalents for better encoding support
|
403
|
+
- **Cross-platform Compatibility**: Improved compatibility across different operating systems
|
404
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase
|
405
|
+
|
406
|
+
### Version 1.3.3
|
407
|
+
- **Real-time Progress Monitoring**: Added CLI-only progress tracking feature
|
408
|
+
- **Enhanced Logging**: Improved status reporting and error handling
|
409
|
+
|
410
|
+
### Version 1.3.2
|
411
|
+
- **Multi-Provider LLM Support**: Added support for OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
412
|
+
- **Unified LLM Processor**: Centralized LLM processing with provider abstraction
|
413
|
+
- **Enhanced Error Handling**: Better error recovery and retry mechanisms
|
315
414
|
|
316
415
|
## License
|
317
416
|
|
@@ -8,7 +8,7 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
8
8
|
|
9
9
|
## Features
|
10
10
|
|
11
|
-
- **Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI
|
11
|
+
- **Enhanced Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI with Playwright
|
12
12
|
- **AI Enrichment**: Local and Azure OpenAI-powered analysis of crate descriptions, features, and documentation
|
13
13
|
- **Multi-Provider LLM Support**: Unified LLM processor supporting OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
14
14
|
- **Cargo Testing**: Automated cargo build, test, and audit execution for comprehensive crate analysis
|
@@ -17,6 +17,15 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
17
17
|
- **Data Export**: Structured output in JSON format for further analysis
|
18
18
|
- **RAG Cache**: Intelligent caching with Rule Zero policies and architectural patterns
|
19
19
|
- **Docker Support**: Containerized deployment with optimized Docker configurations
|
20
|
+
- **Real-time Progress Monitoring**: CLI-based progress tracking with ASCII status indicators
|
21
|
+
- **Cross-platform Compatibility**: Unicode symbols replaced with ASCII equivalents for better encoding support
|
22
|
+
|
23
|
+
## Requirements
|
24
|
+
|
25
|
+
- **Python 3.12+**: Required for modern type annotations and language features
|
26
|
+
- **Git**: For cloning repositories during analysis
|
27
|
+
- **Cargo**: For Rust crate testing and analysis
|
28
|
+
- **Playwright**: Automatically installed for enhanced web scraping
|
20
29
|
|
21
30
|
## Installation
|
22
31
|
|
@@ -25,13 +34,22 @@ The Rust Crate Pipeline is designed to collect, process, and enrich metadata fro
|
|
25
34
|
git clone https://github.com/Superuser666-Sigil/SigilDERG-Data_Production.git
|
26
35
|
cd SigilDERG-Data_Production
|
27
36
|
|
28
|
-
# Install in development mode
|
37
|
+
# Install in development mode (includes all dependencies)
|
29
38
|
pip install -e .
|
30
39
|
|
31
|
-
# Install
|
32
|
-
|
40
|
+
# Install Playwright browsers for enhanced scraping
|
41
|
+
playwright install
|
33
42
|
```
|
34
43
|
|
44
|
+
### Automatic Dependency Installation
|
45
|
+
|
46
|
+
The package automatically installs all required dependencies including:
|
47
|
+
- `crawl4ai` for web scraping
|
48
|
+
- `playwright` for enhanced browser automation
|
49
|
+
- `requests` for HTTP requests
|
50
|
+
- `aiohttp` for async operations
|
51
|
+
- And all other required packages
|
52
|
+
|
35
53
|
## Configuration
|
36
54
|
|
37
55
|
### Environment Variables
|
@@ -111,6 +129,27 @@ python -m rust_crate_pipeline --checkpoint-interval 5
|
|
111
129
|
|
112
130
|
# Enable verbose logging
|
113
131
|
python -m rust_crate_pipeline --log-level DEBUG
|
132
|
+
|
133
|
+
# Enable enhanced scraping with Playwright
|
134
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
135
|
+
|
136
|
+
# Set output directory for results
|
137
|
+
python -m rust_crate_pipeline --output-path ./results
|
138
|
+
```
|
139
|
+
|
140
|
+
#### Enhanced Scraping
|
141
|
+
|
142
|
+
The pipeline now supports enhanced web scraping using Playwright for better data extraction:
|
143
|
+
|
144
|
+
```bash
|
145
|
+
# Enable enhanced scraping (default)
|
146
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
147
|
+
|
148
|
+
# Use basic scraping only
|
149
|
+
python -m rust_crate_pipeline --disable-enhanced-scraping
|
150
|
+
|
151
|
+
# Configure scraping options
|
152
|
+
python -m rust_crate_pipeline --scraping-config '{"max_pages": 10, "concurrency": 3}'
|
114
153
|
```
|
115
154
|
|
116
155
|
#### Multi-Provider LLM Support
|
@@ -231,6 +270,12 @@ clap
|
|
231
270
|
|
232
271
|
## Development
|
233
272
|
|
273
|
+
### Prerequisites
|
274
|
+
|
275
|
+
- Python 3.12+ (required for modern type annotations)
|
276
|
+
- Git for version control
|
277
|
+
- Cargo for Rust crate testing
|
278
|
+
|
234
279
|
### Running Tests
|
235
280
|
|
236
281
|
```bash
|
@@ -242,6 +287,12 @@ pytest tests/test_main_integration.py
|
|
242
287
|
|
243
288
|
# Run with coverage
|
244
289
|
pytest --cov=rust_crate_pipeline tests/
|
290
|
+
|
291
|
+
# Run type checking
|
292
|
+
pyright rust_crate_pipeline/
|
293
|
+
|
294
|
+
# Run linting
|
295
|
+
flake8 rust_crate_pipeline/
|
245
296
|
```
|
246
297
|
|
247
298
|
### Code Quality
|
@@ -255,14 +306,64 @@ isort rust_crate_pipeline/
|
|
255
306
|
|
256
307
|
# Type checking
|
257
308
|
pyright rust_crate_pipeline/
|
309
|
+
|
310
|
+
# Lint code
|
311
|
+
flake8 rust_crate_pipeline/
|
258
312
|
```
|
259
313
|
|
260
|
-
|
314
|
+
### Building and Publishing
|
315
|
+
|
316
|
+
```bash
|
317
|
+
# Build package
|
318
|
+
python -m build
|
319
|
+
|
320
|
+
# Upload to PyPI (requires PYPI_API_TOKEN)
|
321
|
+
python -m twine upload dist/*
|
322
|
+
|
323
|
+
# Create release
|
324
|
+
python scripts/create_release.py
|
325
|
+
```
|
326
|
+
|
327
|
+
### Docker Development
|
328
|
+
|
329
|
+
```bash
|
330
|
+
# Build Docker image
|
331
|
+
docker build -t rust-crate-pipeline .
|
332
|
+
|
333
|
+
# Run in Docker
|
334
|
+
docker run -it rust-crate-pipeline
|
335
|
+
|
336
|
+
# Run with volume mount for development
|
337
|
+
docker run -it -v $(pwd):/app rust-crate-pipeline
|
338
|
+
```
|
261
339
|
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
-
|
340
|
+
## Recent Improvements
|
341
|
+
|
342
|
+
### Version 1.3.6
|
343
|
+
- **Python 3.12+ Requirement**: Updated to use modern type annotations and language features
|
344
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase with modern syntax
|
345
|
+
- **Build System**: Updated pyproject.toml and setup.py for better compatibility
|
346
|
+
|
347
|
+
### Version 1.3.5
|
348
|
+
- **Enhanced Web Scraping**: Added Playwright-based scraping for better data extraction
|
349
|
+
- **Unicode Compatibility**: Replaced all Unicode symbols with ASCII equivalents for better cross-platform support
|
350
|
+
- **Automatic Dependencies**: All required packages are now automatically installed
|
351
|
+
- **Real-time Progress**: Added CLI-based progress monitoring with ASCII status indicators
|
352
|
+
- **Docker Optimization**: Updated Dockerfile to include Playwright browser installation
|
353
|
+
|
354
|
+
### Version 1.3.4
|
355
|
+
- **PEP8 Compliance**: Replaced all Unicode emoji and symbols with ASCII equivalents for better encoding support
|
356
|
+
- **Cross-platform Compatibility**: Improved compatibility across different operating systems
|
357
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase
|
358
|
+
|
359
|
+
### Version 1.3.3
|
360
|
+
- **Real-time Progress Monitoring**: Added CLI-only progress tracking feature
|
361
|
+
- **Enhanced Logging**: Improved status reporting and error handling
|
362
|
+
|
363
|
+
### Version 1.3.2
|
364
|
+
- **Multi-Provider LLM Support**: Added support for OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
365
|
+
- **Unified LLM Processor**: Centralized LLM processing with provider abstraction
|
366
|
+
- **Enhanced Error Handling**: Better error recovery and retry mechanisms
|
266
367
|
|
267
368
|
## License
|
268
369
|
|
@@ -4,40 +4,38 @@ build-backend = "setuptools.build_meta"
|
|
4
4
|
|
5
5
|
[project]
|
6
6
|
name = "rust-crate-pipeline"
|
7
|
-
version = "1.3.4"
|
7
|
+
version = "1.3.6"
|
8
8
|
authors = [
|
9
|
-
{name = "
|
9
|
+
{name = "SigilDERG Team", email = "sigilderg@example.com"}
|
10
10
|
]
|
11
|
-
description = "A comprehensive
|
11
|
+
description = "A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping"
|
12
12
|
readme = "README.md"
|
13
|
-
license = "MIT"
|
14
|
-
requires-python = ">=3.8"
|
13
|
+
license = {text = "MIT"}
|
14
|
+
requires-python = ">=3.12"
|
15
15
|
classifiers = [
|
16
16
|
"Development Status :: 4 - Beta",
|
17
17
|
"Intended Audience :: Developers",
|
18
|
-
"
|
18
|
+
"License :: OSI Approved :: MIT License",
|
19
19
|
"Programming Language :: Python :: 3",
|
20
|
-
"Programming Language :: Python :: 3.8",
|
21
|
-
"Programming Language :: Python :: 3.9",
|
22
|
-
"Programming Language :: Python :: 3.10",
|
23
|
-
"Programming Language :: Python :: 3.11",
|
24
20
|
"Programming Language :: Python :: 3.12",
|
21
|
+
"Programming Language :: Python :: 3.13",
|
25
22
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
26
|
-
"Topic :: Software Development :: Build Tools",
|
27
23
|
"Topic :: Software Development :: Quality Assurance",
|
28
|
-
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
29
24
|
]
|
30
|
-
keywords = ["rust", "crates", "
|
25
|
+
keywords = ["rust", "crates", "analysis", "ai", "pipeline", "scraping"]
|
31
26
|
|
32
27
|
dependencies = [
|
33
28
|
"requests>=2.28.0",
|
34
29
|
"requests-cache>=1.0.0",
|
35
30
|
"beautifulsoup4>=4.11.0",
|
31
|
+
"crawl4ai>=0.6.0",
|
32
|
+
"playwright>=1.49.0",
|
36
33
|
"tqdm>=4.64.0",
|
37
34
|
"llama-cpp-python>=0.2.0",
|
38
35
|
"tiktoken>=0.5.0",
|
39
36
|
"psutil>=5.9.0",
|
40
37
|
"python-dateutil>=2.8.0",
|
38
|
+
"litellm>=1.0.0",
|
41
39
|
]
|
42
40
|
|
43
41
|
[project.optional-dependencies]
|