rust-crate-pipeline 1.4.0-py3-none-any.whl → 1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/config.py +3 -3
- rust_crate_pipeline/main.py +4 -5
- rust_crate_pipeline/version.py +23 -2
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA +79 -6
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/RECORD +9 -9
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt +0 -0
rust_crate_pipeline/config.py
CHANGED
@@ -24,10 +24,10 @@ class PipelineConfig:
     github_token: str = os.getenv("GITHUB_TOKEN", "")
     cache_ttl: int = 3600  # 1 hour
     batch_size: int = 10
-    n_workers: int = 4
-    # Enhanced scraping configuration
+    n_workers: int = 4  # Enhanced scraping configuration
     enable_crawl4ai: bool = True
-    crawl4ai_model: str =
+    crawl4ai_model: str = os.path.expanduser(
+        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
     crawl4ai_timeout: int = 30
 
 
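Note on the new default: `os.path.expanduser` resolves the leading `~` when the class body is evaluated. A minimal sketch of how the changed fields behave, assuming `PipelineConfig` is a dataclass as the field syntax suggests (the rest of the real class is not shown in this diff):

```python
import os
from dataclasses import dataclass


@dataclass
class PipelineConfig:
    # Fields copied from the diff; everything else about the real class is assumed.
    github_token: str = os.getenv("GITHUB_TOKEN", "")
    cache_ttl: int = 3600  # 1 hour
    batch_size: int = 10
    n_workers: int = 4  # Enhanced scraping configuration
    enable_crawl4ai: bool = True
    # expanduser() runs once, when the class body is evaluated, so the stored
    # default is an absolute path such as /home/<user>/models/deepseek/...gguf
    crawl4ai_model: str = os.path.expanduser(
        "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf")
    crawl4ai_timeout: int = 30


if __name__ == "__main__":
    cfg = PipelineConfig()
    print(cfg.crawl4ai_model)  # already expanded, no "~" left
```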
rust_crate_pipeline/main.py
CHANGED
@@ -104,14 +104,13 @@ Examples:
     parser.add_argument(
         '--disable-crawl4ai',
         action='store_true',
-        help='Disable Crawl4AI enhanced scraping (use basic scraping only)'
-    )
+        help='Disable Crawl4AI enhanced scraping (use basic scraping only)' )
 
     parser.add_argument(
         '--crawl4ai-model',
         type=str,
-        default='
-        help='
+        default='~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
+        help='GGUF model path for Crawl4AI content analysis (default: ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf)'
     )
 
     parser.add_argument(
@@ -245,7 +244,7 @@ def main():
         args, 'disable_crawl4ai') else True
     config_kwargs.update({
         'enable_crawl4ai': enable_crawl4ai,
-        'crawl4ai_model': getattr(args, 'crawl4ai_model', '
+        'crawl4ai_model': getattr(args, 'crawl4ai_model', '~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf')
     })
 
     config = PipelineConfig(**config_kwargs)
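A self-contained sketch of the CLI wiring these hunks produce (argument names and defaults are taken from the diff; the surrounding parser setup and `main()` flow are assumed). Unlike the `config.py` default above, the argparse default is the literal `~/...` string and is not expanded by argparse itself:

```python
import argparse

parser = argparse.ArgumentParser(prog="rust_crate_pipeline")
parser.add_argument(
    '--disable-crawl4ai',
    action='store_true',
    help='Disable Crawl4AI enhanced scraping (use basic scraping only)')
parser.add_argument(
    '--crawl4ai-model',
    type=str,
    default='~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
    help='GGUF model path for Crawl4AI content analysis')

args = parser.parse_args(['--disable-crawl4ai'])

# Mirrors the second hunk: the flag is inverted into enable_crawl4ai and the
# model path is forwarded into the config kwargs with the same default string.
enable_crawl4ai = False if getattr(args, 'disable_crawl4ai') else True
config_kwargs = {
    'enable_crawl4ai': enable_crawl4ai,
    'crawl4ai_model': getattr(
        args, 'crawl4ai_model',
        '~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf'),
}
print(config_kwargs)  # {'enable_crawl4ai': False, 'crawl4ai_model': '~/models/...'}
```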
rust_crate_pipeline/version.py
CHANGED
@@ -1,9 +1,30 @@
-"""Version
+"""Version inf - New CLI options: --enable-crawl4ai, --disable-crawl4ai, --crawl4ai-model
+- Enhanced configuration with local GGUF model paths and crawl4ai_timeoutmation for rust-crate-pipeline."""
 
-__version__ = "1.
+__version__ = "1.5.1"
 __version_info__ = tuple(int(x) for x in __version__.split("."))
 
 # Version history
+# 1.5.1 - Configuration Standardization Release: Model Path Consistency
+# - Standardized all configuration to use GGUF model paths
+# - Updated CLI defaults for --crawl4ai-model to ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
+# - Enhanced Rule Zero alignment with transparent configuration practices
+# - Updated all test files to use consistent GGUF model path references
+# - Comprehensive documentation updates for proper model configuration
+# - Removed inconsistent Ollama references in favor of llama-cpp-python
+# - Ensured CLI help text and JSON examples reflect correct model paths
+# 1.5.0 - Major Release: Enhanced Web Scraping with Crawl4AI Integration
+# - Integrated Crawl4AI for advanced web scraping capabilities
+# - Added JavaScript-rendered content extraction via Playwright
+# - Enhanced README parsing with LLM-powered content analysis
+# - Implemented structured data extraction from docs.rs
+# - Added quality scoring for scraped content
+# - Graceful fallback to basic scraping when Crawl4AI unavailable
+# - Full async processing for improved performance
+# - New CLI options: --enable-crawl4ai, --disable-crawl4ai, --crawl4ai-model
+# - Enhanced configuration with crawl4ai_model and crawl4ai_timeout
+# - Comprehensive test coverage for all Crawl4AI features
+# - Rule Zero compliant with full transparency and audit trails
 # 1.4.0 - Major Release: Rule Zero Compliance Audit Complete
 # - Completed comprehensive Rule Zero alignment audit
 # - Eliminated all code redundancy and dead code
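For reference, `__version_info__` is derived by splitting the version string on dots, so the updated value evaluates as follows:

```python
__version__ = "1.5.1"
# Works because every segment is numeric; a suffix like "1.5.1rc1" would raise ValueError.
__version_info__ = tuple(int(x) for x in __version__.split("."))
assert __version_info__ == (1, 5, 1)
```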
{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rust-crate-pipeline
-Version: 1.
+Version: 1.5.1
 Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
 Home-page: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
 Author: SuperUser666-Sigil
@@ -51,11 +51,13 @@ Dynamic: requires-python
 
 [](https://www.python.org/downloads/)
 [](https://opensource.org/licenses/MIT)
-[](https://pypi.org/project/rust-crate-pipeline/)
 [](https://docker.com/)
 [](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/blob/main/SYSTEM_AUDIT_REPORT.md)
 
-A production-ready, Rule Zero-compliant pipeline for comprehensive Rust crate analysis, featuring AI-powered insights
+A production-ready, Rule Zero-compliant pipeline for comprehensive Rust crate analysis, featuring **AI-powered insights**, **enhanced web scraping with Crawl4AI**, dependency mapping, and automated data enrichment. Designed for researchers, developers, and data scientists studying the Rust ecosystem.
+
+**🆕 New in v1.5.1**: Model path standardization, improved GGUF configuration consistency, and enhanced Rule Zero alignment.
 
 📦 **Available on PyPI:** [rust-crate-pipeline](https://pypi.org/project/rust-crate-pipeline/)
 
@@ -126,6 +128,25 @@ python3 -m rust_crate_pipeline --skip-ai --limit 50
 ### 4. Advanced Usage
 
 ```bash
+# Enhanced web scraping with Crawl4AI (default in v1.5.0)
+python3 -m rust_crate_pipeline --enable-crawl4ai --limit 20
+
+# Disable Crawl4AI for basic scraping only
+python3 -m rust_crate_pipeline --disable-crawl4ai --limit 20
+
+# Custom Crawl4AI model configuration
+python3 -m rust_crate_pipeline \
+    --enable-crawl4ai \
+    --crawl4ai-model "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf" \
+    --limit 10
+
+# Sigil Protocol with enhanced scraping
+python3 -m rust_crate_pipeline \
+    --enable-sigil-protocol \
+    --enable-crawl4ai \
+    --skip-ai \
+    --limit 5
+
 # Custom configuration
 python3 -m rust_crate_pipeline \
     --limit 100 \
@@ -147,7 +168,16 @@ python3 -m rust_crate_pipeline \
 
 ## 🎯 Features
 
-*Available in the latest version: [rust-crate-pipeline v1.
+*Available in the latest version: [rust-crate-pipeline v1.5.1](https://pypi.org/project/rust-crate-pipeline/)*
+
+### 🌐 Enhanced Web Scraping (New in v1.5.0)
+
+- **Crawl4AI Integration**: Advanced web scraping with AI-powered content extraction
+- **JavaScript Rendering**: Playwright-powered browser automation for dynamic content
+- **Smart Content Analysis**: LLM-enhanced README and documentation parsing
+- **Structured Data Extraction**: Intelligent parsing of docs.rs and technical documentation
+- **Quality Scoring**: Automated content quality assessment and validation
+- **Graceful Fallbacks**: Automatic degradation to basic scraping when needed
 
 ### 📊 Data Collection & Analysis
 
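The "Graceful Fallbacks" bullet added above describes behavior rather than code. A hedged sketch of one common way such a fallback can be structured; the function names and logic here are hypothetical illustrations, not taken from the package, and the Crawl4AI-backed path is only stubbed because its API is not part of this diff:

```python
import importlib.util

import requests
from bs4 import BeautifulSoup

# Detect the optional dependency without importing it.
CRAWL4AI_AVAILABLE = importlib.util.find_spec("crawl4ai") is not None


def basic_scrape(url: str) -> str:
    """Plain requests + BeautifulSoup text extraction (the fallback path)."""
    html = requests.get(url, timeout=30).text
    return BeautifulSoup(html, "html.parser").get_text(" ", strip=True)


def enhanced_scrape(url: str) -> str:
    """Stand-in for a Crawl4AI-backed extractor; real API intentionally omitted."""
    raise RuntimeError("enhanced path not implemented in this sketch")


def scrape_page(url: str, enable_crawl4ai: bool = True) -> str:
    if enable_crawl4ai and CRAWL4AI_AVAILABLE:
        try:
            return enhanced_scrape(url)
        except Exception:
            pass  # degrade gracefully if the enhanced path fails
    return basic_scrape(url)


if __name__ == "__main__":
    print(scrape_page("https://docs.rs/serde")[:200])
```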
@@ -171,8 +201,35 @@ python3 -m rust_crate_pipeline \
 - **Robust error handling**: Graceful degradation and comprehensive logging
 - **Progress checkpointing**: Automatic saving for long-running processes
 - **Docker ready**: Full container support with optimized configurations
+- **Rule Zero Compliance**: Full transparency and audit trail support
 
-##
+## � Recent Updates
+
+### Version 1.5.1 - Configuration Standardization (Latest)
+- 🔧 **Model Path Consistency**: Standardized all configuration to use GGUF model paths (`~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf`)
+- ⚖️ **Rule Zero Alignment**: Enhanced compliance with Rule Zero principles for transparency and validation
+- 📝 **Documentation Updates**: Comprehensive updates to reflect proper model configuration practices
+- 🧪 **Test Standardization**: Updated all test files to use consistent GGUF model paths
+- 🚀 **CLI Consistency**: Ensured all CLI defaults and help text reflect correct model paths
+
+### Version 1.5.0 - Enhanced Web Scraping
+- 🚀 **Crawl4AI Integration**: Advanced web scraping with AI-powered content extraction
+- 🌐 **JavaScript Rendering**: Playwright-powered browser automation for dynamic content
+- 🧠 **LLM-Enhanced Parsing**: AI-powered README and documentation analysis
+- 📊 **Structured Data Extraction**: Intelligent parsing of docs.rs and technical documentation
+- ⚡ **Async Processing**: High-performance concurrent web scraping
+- 🛡️ **Graceful Fallbacks**: Automatic degradation to basic scraping when needed
+
+### Version 1.4.0 - Rule Zero Compliance
+- 🏆 **Rule Zero Certification**: Complete alignment audit and compliance verification
+- 🧪 **100% Test Coverage**: All 22 tests passing with comprehensive validation
+- 🔄 **Thread-Free Architecture**: Pure asyncio implementation for better performance
+- 📦 **PyPI Integration**: Official package availability with easy installation
+- 🐳 **Docker Support**: Full containerization with production-ready configurations
+
+*For complete version history, see [CHANGELOG.md](CHANGELOG.md)*
+
+## �💻 System Requirements
 
 ### Minimum Requirements
 
@@ -193,12 +250,21 @@ python3 -m rust_crate_pipeline \
 Core dependencies are automatically installed:
 
 ```bash
+# Core functionality
 requests>=2.28.0
 requests-cache>=0.9.0
 beautifulsoup4>=4.11.0
 tqdm>=4.64.0
+
+# AI and LLM processing
 llama-cpp-python>=0.2.0
 tiktoken>=0.4.0
+
+# Enhanced web scraping (New in v1.5.0)
+crawl4ai>=0.6.0
+playwright>=1.49.0
+
+# System utilities
 psutil>=5.9.0
 python-dateutil>=2.8.0
 ```
@@ -219,6 +285,11 @@ python-dateutil>=2.8.0
 | `--log-level` | str | INFO | Logging verbosity |
 | `--skip-ai` | flag | False | Skip AI enrichment |
 | `--skip-source-analysis` | flag | False | Skip source code analysis |
+| `--enable-crawl4ai` | flag | True | Enable enhanced web scraping (default) |
+| `--disable-crawl4ai` | flag | False | Disable Crawl4AI, use basic scraping |
+| `--crawl4ai-model` | str | ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf | GGUF model path for content analysis |
+| `--enable-sigil-protocol` | flag | False | Enable Rule Zero compliance mode |
+| `--sigil-mode` | str | enhanced | Sigil processing mode |
 | `--crate-list` | list | None | Specific crates to process |
 | `--config-file` | str | None | JSON configuration file |
 
@@ -254,7 +325,9 @@ Create a JSON configuration file for custom settings:
     "batch_size": 10,
     "github_min_remaining": 500,
     "cache_ttl": 7200,
-    "model_path": "~/models/your-model.gguf"
+    "model_path": "~/models/your-model.gguf", "enable_crawl4ai": true,
+    "crawl4ai_model": "~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
+    "crawl4ai_timeout": 30
 }
 ```
 
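A short sketch of consuming such a JSON file, assuming the keys map directly onto `PipelineConfig` fields as the `--config-file` option suggests; the loader below is hypothetical, since the actual loading code is not part of this diff:

```python
import json
import os


def load_config_kwargs(path: str) -> dict:
    """Hypothetical loader: read the JSON config and expand "~" in model paths
    before handing the values to PipelineConfig(**config_kwargs) as main.py does."""
    with open(path, "r", encoding="utf-8") as fh:
        kwargs = json.load(fh)
    for key in ("model_path", "crawl4ai_model"):
        if key in kwargs:
            kwargs[key] = os.path.expanduser(kwargs[key])
    return kwargs


if __name__ == "__main__":
    kwargs = load_config_kwargs("config.json")  # the JSON shown above
    print(kwargs["crawl4ai_model"], kwargs["crawl4ai_timeout"])
```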
{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/RECORD
RENAMED
@@ -2,18 +2,18 @@ rust_crate_pipeline/__init__.py,sha256=NxD8_OEGHEHUN9EfJj2S1rRyZ0UMkiF20LNSMnjL9
 rust_crate_pipeline/__main__.py,sha256=fYgtPofuk4vkwiZ7ELP4GVMNj_QiKmZMSlvhzsNGuDs,155
 rust_crate_pipeline/ai_processing.py,sha256=sj-qPtIVLuuY_VoWoLbcGQ6_eS_giQyXIPyAGAWOCrs,24814
 rust_crate_pipeline/analysis.py,sha256=jcHHTBZ_zg5n4VGPXJYM7-NkNeL5hRdgvowkiim0onM,17663
-rust_crate_pipeline/config.py,sha256=
+rust_crate_pipeline/config.py,sha256=CeDlEZ08UDA_1DkcIfTOoPpYj3kGBZNGwsefRjBKlwg,2396
 rust_crate_pipeline/github_token_checker.py,sha256=_cyOiSYc1bCVczr6pUUJc_s822ic7Qi_IW3JtI_4C0w,3796
-rust_crate_pipeline/main.py,sha256=
+rust_crate_pipeline/main.py,sha256=UZj2pcHAzG5MdrgHhahWnsz3MuTQfVQ6yzf91jPtli0,10224
 rust_crate_pipeline/network.py,sha256=MFtn_-9MRBUSehfjLboUBGOMk8gv2edjOjHCR_YEyGc,12677
 rust_crate_pipeline/pipeline.py,sha256=aOLuIpfvDbPDCvft8ppUa0vRiFVdiz2wltpi26ZJaes,22769
 rust_crate_pipeline/production_config.py,sha256=24YWT68Fo2Kl8v7Hn1WgqfPrikXma9VZEuEcMr7iDik,2282
-rust_crate_pipeline/version.py,sha256=
+rust_crate_pipeline/version.py,sha256=BS9a-IKMe4pIl-nSmLaSJ2bDo6r87s_h8Mk5TAsrsiI,4291
 rust_crate_pipeline/utils/file_utils.py,sha256=IJOBBp6-w9pnCdqyGcRNwBph_iwI_zzULCdAULGFUy0,2097
 rust_crate_pipeline/utils/logging_utils.py,sha256=5-o6ohm38sH1ozjZWHPlm9Wj7yILiUzvMsLJDeu11lk,2350
-rust_crate_pipeline-1.
-rust_crate_pipeline-1.
-rust_crate_pipeline-1.
-rust_crate_pipeline-1.
-rust_crate_pipeline-1.
-rust_crate_pipeline-1.
+rust_crate_pipeline-1.5.1.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
+rust_crate_pipeline-1.5.1.dist-info/METADATA,sha256=Rk8aWxLEwJJgpuTHTHmU_JsI3BY7aHk_YWaDv22rhno,21349
+rust_crate_pipeline-1.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rust_crate_pipeline-1.5.1.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
+rust_crate_pipeline-1.5.1.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
+rust_crate_pipeline-1.5.1.dist-info/RECORD,,
{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/WHEEL
RENAMED
File without changes

{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/entry_points.txt
RENAMED
File without changes

{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/licenses/LICENSE
RENAMED
File without changes

{rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.5.1.dist-info}/top_level.txt
RENAMED
File without changes