rust-crate-pipeline 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline-1.1.0/LICENSE +21 -0
- rust_crate_pipeline-1.1.0/MANIFEST.in +11 -0
- rust_crate_pipeline-1.1.0/PKG-INFO +473 -0
- rust_crate_pipeline-1.1.0/PUBLISHING.md +93 -0
- rust_crate_pipeline-1.1.0/README.md +425 -0
- rust_crate_pipeline-1.1.0/READY_FOR_PYPI.md +107 -0
- rust_crate_pipeline-1.1.0/SETUP_GUIDE.md +177 -0
- rust_crate_pipeline-1.1.0/pyproject.toml +75 -0
- rust_crate_pipeline-1.1.0/requirements-dev.txt +21 -0
- rust_crate_pipeline-1.1.0/requirements.txt +17 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/__init__.py +52 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/__main__.py +6 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/ai_processing.py +396 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/analysis.py +435 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/config.py +46 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/main.py +177 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/network.py +307 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/pipeline.py +260 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/utils/file_utils.py +72 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/utils/logging_utils.py +66 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline/version.py +13 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/PKG-INFO +473 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/SOURCES.txt +28 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/dependency_links.txt +1 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/entry_points.txt +2 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/requires.txt +16 -0
- rust_crate_pipeline-1.1.0/rust_crate_pipeline.egg-info/top_level.txt +1 -0
- rust_crate_pipeline-1.1.0/setup.cfg +4 -0
- rust_crate_pipeline-1.1.0/setup.py +60 -0
- rust_crate_pipeline-1.1.0/tests/test_basic.py +39 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 DaveTmire85
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
@@ -0,0 +1,11 @@
|
|
1
|
+
include README.md
|
2
|
+
include LICENSE
|
3
|
+
include requirements.txt
|
4
|
+
include *.md
|
5
|
+
include *.txt
|
6
|
+
recursive-include rust_crate_pipeline *.py
|
7
|
+
global-exclude __pycache__
|
8
|
+
global-exclude *.py[co]
|
9
|
+
global-exclude .DS_Store
|
10
|
+
global-exclude *.so
|
11
|
+
global-exclude .git*
|
@@ -0,0 +1,473 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: rust-crate-pipeline
|
3
|
+
Version: 1.1.0
|
4
|
+
Summary: A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights
|
5
|
+
Home-page: https://github.com/DaveTmire85/SigilDERG-Data_Production
|
6
|
+
Author: SuperUser666-Sigil
|
7
|
+
Author-email: SuperUser666-Sigil <miragemodularframework@gmail.com>
|
8
|
+
License-Expression: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
10
|
+
Project-URL: Documentation, https://github.com/DaveTmire85/SigilDERG-Data_Production#readme
|
11
|
+
Project-URL: Repository, https://github.com/DaveTmire85/SigilDERG-Data_Production
|
12
|
+
Project-URL: Bug Tracker, https://github.com/DaveTmire85/SigilDERG-Data_Production/issues
|
13
|
+
Keywords: rust,crates,metadata,ai,analysis,pipeline,dependencies
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
15
|
+
Classifier: Intended Audience :: Developers
|
16
|
+
Classifier: Operating System :: OS Independent
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
24
|
+
Classifier: Topic :: Software Development :: Build Tools
|
25
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
27
|
+
Requires-Python: >=3.8
|
28
|
+
Description-Content-Type: text/markdown
|
29
|
+
License-File: LICENSE
|
30
|
+
Requires-Dist: requests>=2.28.0
|
31
|
+
Requires-Dist: requests-cache>=1.0.0
|
32
|
+
Requires-Dist: beautifulsoup4>=4.11.0
|
33
|
+
Requires-Dist: tqdm>=4.64.0
|
34
|
+
Requires-Dist: llama-cpp-python>=0.2.0
|
35
|
+
Requires-Dist: tiktoken>=0.5.0
|
36
|
+
Requires-Dist: psutil>=5.9.0
|
37
|
+
Provides-Extra: dev
|
38
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
39
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
40
|
+
Requires-Dist: isort>=5.10.0; extra == "dev"
|
41
|
+
Provides-Extra: advanced
|
42
|
+
Requires-Dist: radon>=6.0.0; extra == "advanced"
|
43
|
+
Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
|
44
|
+
Dynamic: author
|
45
|
+
Dynamic: home-page
|
46
|
+
Dynamic: license-file
|
47
|
+
Dynamic: requires-python
|
48
|
+
|
49
|
+
# Rust Crate Data Processing Pipeline
|
50
|
+
|
51
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
52
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
53
|
+
|
54
|
+
A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights and dependency analysis.
|
55
|
+
|
56
|
+
## 🚀 Features
|
57
|
+
|
58
|
+
### 📊 **Comprehensive Data Collection**
|
59
|
+
- **Multi-source metadata fetching**: Pulls data from crates.io, GitHub, and lib.rs
|
60
|
+
- **Dependency analysis**: Complete dependency graphs and reverse dependency mapping
|
61
|
+
- **Code snippet extraction**: Automatically extracts Rust code examples from READMEs
|
62
|
+
- **Feature analysis**: Detailed breakdown of crate features and their dependencies
|
63
|
+
|
64
|
+
### 🤖 **AI-Powered Enrichment**
|
65
|
+
- **Use case classification**: Automatically categorizes crates (Web Framework, ML, Database, etc.)
|
66
|
+
- **Feature summarization**: AI-generated explanations of crate features
|
67
|
+
- **Factual/counterfactual pairs**: Generates training data for fact verification
|
68
|
+
- **Smart content truncation**: Intelligently preserves important README sections
|
69
|
+
|
70
|
+
### 🔍 **Advanced Analysis**
|
71
|
+
- **Source code metrics**: Lines of code, complexity analysis, API surface area
|
72
|
+
- **Security scanning**: Vulnerability checks and security pattern analysis
|
73
|
+
- **Community metrics**: GitHub activity, issue tracking, version adoption
|
74
|
+
- **Performance optimization**: Batch processing, caching, and retry logic
|
75
|
+
|
76
|
+
### ⚡ **Production-Ready Features**
|
77
|
+
- **Robust error handling**: Graceful degradation and comprehensive logging
|
78
|
+
- **Rate limiting**: Respects GitHub API limits with intelligent backoff
|
79
|
+
- **Checkpointing**: Automatic progress saving for long-running processes
|
80
|
+
- **Configurable processing**: Extensive CLI and config file options
|
81
|
+
|
82
|
+
## 📋 Prerequisites
|
83
|
+
|
84
|
+
### Required Dependencies
|
85
|
+
```bash
|
86
|
+
pip install requests requests-cache beautifulsoup4 tqdm llama-cpp-python tiktoken psutil
|
87
|
+
```
|
88
|
+
|
89
|
+
### Optional Dependencies
|
90
|
+
```bash
|
91
|
+
pip install radon rustworkx # For advanced code analysis
|
92
|
+
```
|
93
|
+
|
94
|
+
### System Requirements
|
95
|
+
- **Python 3.8+**
|
96
|
+
- **Local LLM Model**: Deepseek Coder or compatible GGUF model
|
97
|
+
- **GitHub Token**: For enhanced GitHub API access (optional but recommended)
|
98
|
+
- **Disk Space**: ~1GB free space for processing and caching
|
99
|
+
|
100
|
+
## 🛠️ Installation
|
101
|
+
|
102
|
+
### 1. Clone the Repository
|
103
|
+
```bash
|
104
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
105
|
+
cd SigilDERG-Data_Production
|
106
|
+
```
|
107
|
+
|
108
|
+
### 2. Install Dependencies
|
109
|
+
```bash
|
110
|
+
pip install -r requirements.txt
|
111
|
+
```
|
112
|
+
|
113
|
+
### 3. Download LLM Model
|
114
|
+
```bash
|
115
|
+
# Example: Download Deepseek Coder model
|
116
|
+
mkdir -p ~/models/deepseek/
|
117
|
+
wget https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GGUF/resolve/main/deepseek-coder-6.7b-instruct.Q4_K_M.gguf \
|
118
|
+
-O ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
|
119
|
+
```
|
120
|
+
|
121
|
+
### 4. Set Environment Variables (Optional)
|
122
|
+
```bash
|
123
|
+
export GITHUB_TOKEN="your_github_token_here"
|
124
|
+
```
|
125
|
+
|
126
|
+
## 🚀 Quick Start
|
127
|
+
|
128
|
+
### Installation
|
129
|
+
|
130
|
+
#### From PyPI (Recommended)
|
131
|
+
```bash
|
132
|
+
pip install rust-crate-pipeline
|
133
|
+
```
|
134
|
+
|
135
|
+
#### From Source
|
136
|
+
```bash
|
137
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
138
|
+
cd SigilDERG-Data_Production
|
139
|
+
pip install -e .
|
140
|
+
```
|
141
|
+
|
142
|
+
#### Development Installation
|
143
|
+
```bash
|
144
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
145
|
+
cd SigilDERG-Data_Production
|
146
|
+
pip install -e ".[dev]"
|
147
|
+
```
|
148
|
+
|
149
|
+
### Basic Usage
|
150
|
+
```bash
|
151
|
+
# Run with default settings
|
152
|
+
python -m rust_crate_pipeline
|
153
|
+
|
154
|
+
# Process only 20 crates for testing
|
155
|
+
python -m rust_crate_pipeline --limit 20
|
156
|
+
|
157
|
+
# Skip AI processing for faster metadata-only collection
|
158
|
+
python -m rust_crate_pipeline --skip-ai --limit 50
|
159
|
+
```
|
160
|
+
|
161
|
+
### Advanced Usage
|
162
|
+
```bash
|
163
|
+
# Custom configuration
|
164
|
+
python -m rust_crate_pipeline \
|
165
|
+
--limit 100 \
|
166
|
+
--batch-size 5 \
|
167
|
+
--workers 2 \
|
168
|
+
--log-level DEBUG \
|
169
|
+
--output-dir ./results
|
170
|
+
|
171
|
+
# Process specific crates
|
172
|
+
python -m rust_crate_pipeline \
|
173
|
+
--crate-list serde tokio actix-web reqwest \
|
174
|
+
--output-dir ./specific_crates
|
175
|
+
|
176
|
+
# Use custom model and config
|
177
|
+
python -m rust_crate_pipeline \
|
178
|
+
--model-path ./my-model.gguf \
|
179
|
+
--config-file ./custom_config.json
|
180
|
+
```
|
181
|
+
|
182
|
+
## 📁 Project Structure
|
183
|
+
|
184
|
+
```
|
185
|
+
rust_crate_pipeline/
|
186
|
+
├── __init__.py # Package initialization and public API
|
187
|
+
├── __main__.py # Entry point for python -m execution
|
188
|
+
├── main.py # CLI interface and main execution logic
|
189
|
+
├── config.py # Configuration classes and data models
|
190
|
+
├── pipeline.py # Main orchestration and workflow management
|
191
|
+
├── ai_processing.py # LLM integration and AI-powered enrichment
|
192
|
+
├── network.py # API clients and HTTP request handling
|
193
|
+
├── analysis.py # Source code, security, and dependency analysis
|
194
|
+
└── utils/ # Utility functions
|
195
|
+
├── logging_utils.py # Logging configuration and decorators
|
196
|
+
└── file_utils.py # File operations and disk management
|
197
|
+
```
|
198
|
+
|
199
|
+
## ⚙️ Configuration
|
200
|
+
|
201
|
+
### Command Line Arguments
|
202
|
+
|
203
|
+
| Argument | Type | Default | Description |
|
204
|
+
|----------|------|---------|-------------|
|
205
|
+
| `--limit` | int | None | Limit number of crates to process |
|
206
|
+
| `--batch-size` | int | 10 | Crates processed per batch |
|
207
|
+
| `--workers` | int | 4 | Parallel workers for API requests |
|
208
|
+
| `--output-dir` | str | auto | Custom output directory |
|
209
|
+
| `--model-path` | str | default | Path to LLM model file |
|
210
|
+
| `--max-tokens` | int | 256 | Maximum tokens for LLM generation |
|
211
|
+
| `--checkpoint-interval` | int | 10 | Save progress every N crates |
|
212
|
+
| `--log-level` | str | INFO | Logging verbosity |
|
213
|
+
| `--skip-ai` | flag | False | Skip AI enrichment |
|
214
|
+
| `--skip-source-analysis` | flag | False | Skip source code analysis |
|
215
|
+
| `--crate-list` | list | None | Specific crates to process |
|
216
|
+
| `--config-file` | str | None | JSON configuration file |
|
217
|
+
|
218
|
+
### Configuration File Example
|
219
|
+
```json
|
220
|
+
{
|
221
|
+
"model_path": "/path/to/your/model.gguf",
|
222
|
+
"batch_size": 5,
|
223
|
+
"n_workers": 2,
|
224
|
+
"max_tokens": 512,
|
225
|
+
"checkpoint_interval": 5,
|
226
|
+
"github_token": "ghp_your_token_here",
|
227
|
+
"cache_ttl": 7200
|
228
|
+
}
|
229
|
+
```
|
230
|
+
|
231
|
+
## 📊 Output Format
|
232
|
+
|
233
|
+
The pipeline generates several output files:
|
234
|
+
|
235
|
+
### 1. **Enriched Metadata** (`enriched_crate_metadata_TIMESTAMP.jsonl`)
|
236
|
+
```json
|
237
|
+
{
|
238
|
+
"name": "serde",
|
239
|
+
"version": "1.0.193",
|
240
|
+
"description": "A generic serialization/deserialization framework",
|
241
|
+
"use_case": "Serialization",
|
242
|
+
"score": 8542.3,
|
243
|
+
"feature_summary": "Provides derive macros for automatic serialization...",
|
244
|
+
"factual_counterfactual": "✅ Factual: Serde supports JSON serialization...",
|
245
|
+
"source_analysis": {
|
246
|
+
"file_count": 45,
|
247
|
+
"loc": 12500,
|
248
|
+
"functions": ["serialize", "deserialize", ...],
|
249
|
+
"has_tests": true
|
250
|
+
}
|
251
|
+
}
|
252
|
+
```
|
253
|
+
|
254
|
+
### 2. **Dependency Analysis** (`dependency_analysis_TIMESTAMP.json`)
|
255
|
+
```json
|
256
|
+
{
|
257
|
+
"dependency_graph": {
|
258
|
+
"actix-web": ["tokio", "serde", "futures"],
|
259
|
+
"tokio": ["mio", "parking_lot"]
|
260
|
+
},
|
261
|
+
"reverse_dependencies": {
|
262
|
+
"serde": ["actix-web", "reqwest", "clap"],
|
263
|
+
"tokio": ["actix-web", "reqwest"]
|
264
|
+
},
|
265
|
+
"most_depended": [
|
266
|
+
["serde", 156],
|
267
|
+
["tokio", 98]
|
268
|
+
]
|
269
|
+
}
|
270
|
+
```
|
271
|
+
|
272
|
+
### 3. **Summary Report** (`summary_report_TIMESTAMP.json`)
|
273
|
+
```json
|
274
|
+
{
|
275
|
+
"total_crates": 150,
|
276
|
+
"total_time": "1247.32s",
|
277
|
+
"timestamp": "2025-06-18T10:30:00",
|
278
|
+
"most_popular": [
|
279
|
+
{"name": "serde", "score": 8542.3},
|
280
|
+
{"name": "tokio", "score": 7234.1}
|
281
|
+
]
|
282
|
+
}
|
283
|
+
```
|
284
|
+
|
285
|
+
## 🔧 Advanced Features
|
286
|
+
|
287
|
+
### Custom Crate Lists
|
288
|
+
Process specific crates by providing a custom list:
|
289
|
+
```bash
|
290
|
+
python -m rust_crate_pipeline --crate-list \
|
291
|
+
serde tokio actix-web reqwest clap \
|
292
|
+
--output-dir ./web_framework_analysis
|
293
|
+
```
|
294
|
+
|
295
|
+
### Performance Tuning
|
296
|
+
Optimize for your system:
|
297
|
+
```bash
|
298
|
+
# High-performance setup (good internet, powerful machine)
|
299
|
+
python -m rust_crate_pipeline --batch-size 20 --workers 8
|
300
|
+
|
301
|
+
# Conservative setup (limited resources)
|
302
|
+
python -m rust_crate_pipeline --batch-size 3 --workers 1
|
303
|
+
```
|
304
|
+
|
305
|
+
### Development Mode
|
306
|
+
Quick testing with minimal processing:
|
307
|
+
```bash
|
308
|
+
python -m rust_crate_pipeline \
|
309
|
+
--limit 5 \
|
310
|
+
--skip-ai \
|
311
|
+
--skip-source-analysis \
|
312
|
+
--log-level DEBUG
|
313
|
+
```
|
314
|
+
|
315
|
+
## 🏗️ Architecture
|
316
|
+
|
317
|
+
### Core Components
|
318
|
+
|
319
|
+
1. **CrateDataPipeline**: Main orchestration class that coordinates all processing
|
320
|
+
2. **LLMEnricher**: Handles AI-powered enrichment using local LLM models
|
321
|
+
3. **CrateAPIClient**: Manages API interactions with crates.io and fallback sources
|
322
|
+
4. **GitHubBatchClient**: Optimized GitHub API client with rate limiting
|
323
|
+
5. **SourceAnalyzer**: Analyzes source code metrics and complexity
|
324
|
+
6. **SecurityAnalyzer**: Checks for security vulnerabilities and patterns
|
325
|
+
7. **UserBehaviorAnalyzer**: Tracks community engagement and version adoption
|
326
|
+
8. **DependencyAnalyzer**: Builds and analyzes dependency relationships
|
327
|
+
|
328
|
+
### Processing Flow
|
329
|
+
|
330
|
+
```
|
331
|
+
1. Crate Discovery → 2. Metadata Fetching → 3. AI Enrichment
|
332
|
+
↓ ↓ ↓
|
333
|
+
4. Source Analysis → 5. Security Scanning → 6. Community Analysis
|
334
|
+
↓ ↓ ↓
|
335
|
+
7. Dependency Mapping → 8. Data Aggregation → 9. Report Generation
|
336
|
+
```
|
337
|
+
|
338
|
+
## 🧪 API Usage
|
339
|
+
|
340
|
+
### Programmatic Usage
|
341
|
+
```python
|
342
|
+
from rust_crate_pipeline import CrateDataPipeline, PipelineConfig
|
343
|
+
|
344
|
+
# Create custom configuration
|
345
|
+
config = PipelineConfig(
|
346
|
+
batch_size=5,
|
347
|
+
max_tokens=512,
|
348
|
+
model_path="/path/to/model.gguf"
|
349
|
+
)
|
350
|
+
|
351
|
+
# Initialize and run pipeline
|
352
|
+
pipeline = CrateDataPipeline(config)
|
353
|
+
pipeline.run()
|
354
|
+
|
355
|
+
# Or use individual components
|
356
|
+
from rust_crate_pipeline import LLMEnricher, SourceAnalyzer
|
357
|
+
|
358
|
+
enricher = LLMEnricher(config)
|
359
|
+
analyzer = SourceAnalyzer()
|
360
|
+
```
|
361
|
+
|
362
|
+
### Custom Processing
|
363
|
+
```python
|
364
|
+
# Process specific crates with custom options
|
365
|
+
pipeline = CrateDataPipeline(
|
366
|
+
config,
|
367
|
+
limit=50,
|
368
|
+
crate_list=["serde", "tokio", "actix-web"],
|
369
|
+
skip_ai=False,
|
370
|
+
output_dir="./custom_analysis"
|
371
|
+
)
|
372
|
+
```
|
373
|
+
|
374
|
+
## 🐛 Troubleshooting
|
375
|
+
|
376
|
+
### Common Issues
|
377
|
+
|
378
|
+
**🔴 Model Loading Errors**
|
379
|
+
```bash
|
380
|
+
# Verify model path
|
381
|
+
ls -la ~/models/deepseek/deepseek-coder-6.7b-instruct.Q4_K_M.gguf
|
382
|
+
|
383
|
+
# Check that llama-cpp-python is installed and importable (this does not load the model file)
|
384
|
+
python -c "from llama_cpp import Llama; print('Model loading OK')"
|
385
|
+
```
|
386
|
+
|
387
|
+
**🔴 API Rate Limiting**
|
388
|
+
```bash
|
389
|
+
# Set GitHub token for higher rate limits
|
390
|
+
export GITHUB_TOKEN="your_token_here"
|
391
|
+
|
392
|
+
# Reduce batch size and workers
|
393
|
+
python -m rust_crate_pipeline --batch-size 3 --workers 1
|
394
|
+
```
|
395
|
+
|
396
|
+
**🔴 Memory Issues**
|
397
|
+
```bash
|
398
|
+
# Reduce token limits and batch size
|
399
|
+
python -m rust_crate_pipeline --max-tokens 128 --batch-size 2
|
400
|
+
```
|
401
|
+
|
402
|
+
**🔴 Network Timeouts**
|
403
|
+
```bash
|
404
|
+
# Enable debug logging to identify issues
|
405
|
+
python -m rust_crate_pipeline --log-level DEBUG --limit 10
|
406
|
+
```
|
407
|
+
|
408
|
+
### Performance Optimization
|
409
|
+
|
410
|
+
1. **Use SSD storage** for faster caching and temporary file operations
|
411
|
+
2. **Increase RAM** if processing large batches (recommended: 8GB+)
|
412
|
+
3. **Set GITHUB_TOKEN** for 5000 req/hour instead of 60 req/hour
|
413
|
+
4. **Use appropriate batch sizes** based on your internet connection
|
414
|
+
5. **Monitor disk space** - processing can generate several GB of data
|
415
|
+
|
416
|
+
## 📈 Performance Metrics
|
417
|
+
|
418
|
+
### Typical Processing Times
|
419
|
+
- **Metadata only**: ~2-3 seconds per crate
|
420
|
+
- **With AI enrichment**: ~15-30 seconds per crate
|
421
|
+
- **Full analysis**: ~45-60 seconds per crate
|
422
|
+
|
423
|
+
### Resource Usage
|
424
|
+
- **Memory**: 2-4GB during processing
|
425
|
+
- **Disk**: 10-50MB per crate (temporary files)
|
426
|
+
- **Network**: ~1-5MB per crate (API calls)
|
427
|
+
|
428
|
+
## 🤝 Contributing
|
429
|
+
|
430
|
+
### Development Setup
|
431
|
+
```bash
|
432
|
+
# Clone repository
|
433
|
+
git clone https://github.com/DaveTmire85/SigilDERG-Data_Production.git
|
434
|
+
cd SigilDERG-Data_Production
|
435
|
+
|
436
|
+
# Install development dependencies
|
437
|
+
pip install -r requirements-dev.txt
|
438
|
+
|
439
|
+
# Run tests
|
440
|
+
python -m pytest tests/
|
441
|
+
|
442
|
+
# Format code
|
443
|
+
black . && isort .
|
444
|
+
```
|
445
|
+
|
446
|
+
### Adding New Analysis Features
|
447
|
+
1. Implement new analyzer in `analysis.py`
|
448
|
+
2. Add configuration options to `config.py`
|
449
|
+
3. Integrate with pipeline in `pipeline.py`
|
450
|
+
4. Add CLI arguments in `main.py`
|
451
|
+
5. Update documentation
|
452
|
+
|
453
|
+
## 📄 License
|
454
|
+
|
455
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
456
|
+
|
457
|
+
## 🙏 Acknowledgments
|
458
|
+
|
459
|
+
- **Rust Community** for the excellent crates ecosystem
|
460
|
+
- **crates.io** for providing comprehensive API access
|
461
|
+
- **GitHub** for repository metadata and community data
|
462
|
+
- **Deepseek** for the powerful code-focused language model
|
463
|
+
- **llama.cpp** team for efficient local inference capabilities
|
464
|
+
|
465
|
+
## 📞 Support
|
466
|
+
|
467
|
+
- **Issues**: [GitHub Issues](https://github.com/DaveTmire85/SigilDERG-Data_Production/issues)
|
468
|
+
- **Discussions**: [GitHub Discussions](https://github.com/DaveTmire85/SigilDERG-Data_Production/discussions)
|
469
|
+
- **Documentation**: [README](https://github.com/DaveTmire85/SigilDERG-Data_Production#readme)
|
470
|
+
|
471
|
+
---
|
472
|
+
|
473
|
+
**Happy crate analyzing! 🦀✨**
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# Publishing to PyPI
|
2
|
+
|
3
|
+
This document explains how to publish this package to PyPI.
|
4
|
+
|
5
|
+
## Prerequisites
|
6
|
+
|
7
|
+
1. Install build and twine:
|
8
|
+
```bash
|
9
|
+
pip install build twine
|
10
|
+
```
|
11
|
+
|
12
|
+
2. Create accounts on:
|
13
|
+
- [PyPI](https://pypi.org/account/register/) (production)
|
14
|
+
- [TestPyPI](https://test.pypi.org/account/register/) (testing)
|
15
|
+
|
16
|
+
## Building the Package
|
17
|
+
|
18
|
+
Build the distribution files:
|
19
|
+
```bash
|
20
|
+
python -m build
|
21
|
+
```
|
22
|
+
|
23
|
+
This creates:
|
24
|
+
- `dist/rust_crate_pipeline-X.X.X.tar.gz` (source distribution)
|
25
|
+
- `dist/rust_crate_pipeline-X.X.X-py3-none-any.whl` (wheel)
|
26
|
+
|
27
|
+
## Testing on TestPyPI
|
28
|
+
|
29
|
+
First, test on TestPyPI:
|
30
|
+
|
31
|
+
```bash
|
32
|
+
# Upload to TestPyPI
|
33
|
+
python -m twine upload --repository testpypi dist/*
|
34
|
+
|
35
|
+
# Install from TestPyPI to test
|
36
|
+
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ rust-crate-pipeline
|
37
|
+
```
|
38
|
+
|
39
|
+
## Publishing to PyPI
|
40
|
+
|
41
|
+
Once tested, publish to the real PyPI:
|
42
|
+
|
43
|
+
```bash
|
44
|
+
python -m twine upload dist/*
|
45
|
+
```
|
46
|
+
|
47
|
+
## Automation with GitHub Actions
|
48
|
+
|
49
|
+
Consider setting up GitHub Actions for automated publishing. Create `.github/workflows/publish.yml`:
|
50
|
+
|
51
|
+
```yaml
|
52
|
+
name: Publish to PyPI
|
53
|
+
|
54
|
+
on:
|
55
|
+
release:
|
56
|
+
types: [published]
|
57
|
+
|
58
|
+
jobs:
|
59
|
+
publish:
|
60
|
+
runs-on: ubuntu-latest
|
61
|
+
steps:
|
62
|
+
- uses: actions/checkout@v3
|
63
|
+
- name: Set up Python
|
64
|
+
uses: actions/setup-python@v4
|
65
|
+
with:
|
66
|
+
python-version: '3.8'
|
67
|
+
- name: Install dependencies
|
68
|
+
run: |
|
69
|
+
python -m pip install --upgrade pip
|
70
|
+
pip install build twine
|
71
|
+
- name: Build package
|
72
|
+
run: python -m build
|
73
|
+
- name: Publish to PyPI
|
74
|
+
env:
|
75
|
+
TWINE_USERNAME: __token__
|
76
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
77
|
+
run: twine upload dist/*
|
78
|
+
```
|
79
|
+
|
80
|
+
## Version Management
|
81
|
+
|
82
|
+
Update version numbers in:
|
83
|
+
- `pyproject.toml`
|
84
|
+
- `setup.py`
|
85
|
+
- `rust_crate_pipeline/__init__.py`
|
86
|
+
|
87
|
+
## API Token Setup
|
88
|
+
|
89
|
+
For automated publishing, use API tokens instead of username/password:
|
90
|
+
|
91
|
+
1. Go to PyPI account settings
|
92
|
+
2. Generate an API token
|
93
|
+
3. Use `__token__` as username and the token as password
|