rust-crate-pipeline 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rust_crate_pipeline/__init__.py +18 -27
- rust_crate_pipeline/__main__.py +1 -0
- rust_crate_pipeline/ai_processing.py +718 -596
- rust_crate_pipeline/analysis.py +330 -363
- rust_crate_pipeline/azure_ai_processing.py +462 -0
- rust_crate_pipeline/config.py +46 -28
- rust_crate_pipeline/core/__init__.py +19 -0
- rust_crate_pipeline/core/canon_registry.py +133 -0
- rust_crate_pipeline/core/irl_engine.py +256 -0
- rust_crate_pipeline/core/sacred_chain.py +117 -0
- rust_crate_pipeline/crate_analysis.py +54 -0
- rust_crate_pipeline/crate_list.txt +424 -0
- rust_crate_pipeline/github_token_checker.py +108 -112
- rust_crate_pipeline/main.py +329 -109
- rust_crate_pipeline/network.py +317 -308
- rust_crate_pipeline/pipeline.py +300 -375
- rust_crate_pipeline/production_config.py +24 -27
- rust_crate_pipeline/progress_monitor.py +334 -0
- rust_crate_pipeline/scraping/__init__.py +13 -0
- rust_crate_pipeline/scraping/unified_scraper.py +259 -0
- rust_crate_pipeline/unified_llm_processor.py +637 -0
- rust_crate_pipeline/unified_pipeline.py +548 -0
- rust_crate_pipeline/utils/file_utils.py +32 -5
- rust_crate_pipeline/utils/logging_utils.py +21 -16
- rust_crate_pipeline/version.py +76 -47
- rust_crate_pipeline-1.4.1.dist-info/METADATA +515 -0
- rust_crate_pipeline-1.4.1.dist-info/RECORD +31 -0
- rust_crate_pipeline-1.4.0.dist-info/METADATA +0 -585
- rust_crate_pipeline-1.4.0.dist-info/RECORD +0 -19
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/WHEEL +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/entry_points.txt +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {rust_crate_pipeline-1.4.0.dist-info → rust_crate_pipeline-1.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,515 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: rust-crate-pipeline
|
3
|
+
Version: 1.4.1
|
4
|
+
Summary: A comprehensive pipeline for analyzing Rust crates with AI enrichment and enhanced scraping
|
5
|
+
Home-page: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
|
6
|
+
Author: SigilDERG Team
|
7
|
+
Author-email: SigilDERG Team <miragemodularframework@gmail.com>
|
8
|
+
License: MIT
|
9
|
+
Project-URL: Homepage, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
|
10
|
+
Project-URL: Documentation, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production#readme
|
11
|
+
Project-URL: Repository, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production
|
12
|
+
Project-URL: Bug Tracker, https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/issues
|
13
|
+
Keywords: rust,crates,analysis,ai,pipeline,scraping
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
15
|
+
Classifier: Intended Audience :: Developers
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
22
|
+
Requires-Python: >=3.12
|
23
|
+
Description-Content-Type: text/markdown
|
24
|
+
License-File: LICENSE
|
25
|
+
Requires-Dist: requests>=2.28.0
|
26
|
+
Requires-Dist: requests-cache>=1.0.0
|
27
|
+
Requires-Dist: beautifulsoup4>=4.11.0
|
28
|
+
Requires-Dist: crawl4ai>=0.6.0
|
29
|
+
Requires-Dist: playwright>=1.49.0
|
30
|
+
Requires-Dist: tqdm>=4.64.0
|
31
|
+
Requires-Dist: llama-cpp-python>=0.2.0
|
32
|
+
Requires-Dist: tiktoken>=0.5.0
|
33
|
+
Requires-Dist: psutil>=5.9.0
|
34
|
+
Requires-Dist: python-dateutil>=2.8.0
|
35
|
+
Requires-Dist: litellm>=1.0.0
|
36
|
+
Provides-Extra: dev
|
37
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
38
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
39
|
+
Requires-Dist: isort>=5.10.0; extra == "dev"
|
40
|
+
Provides-Extra: advanced
|
41
|
+
Requires-Dist: radon>=6.0.0; extra == "advanced"
|
42
|
+
Requires-Dist: rustworkx>=0.13.0; extra == "advanced"
|
43
|
+
Dynamic: author
|
44
|
+
Dynamic: home-page
|
45
|
+
Dynamic: license-file
|
46
|
+
Dynamic: requires-python
|
47
|
+
|
48
|
+
# Rust Crate Pipeline
|
49
|
+
|
50
|
+
A comprehensive system for gathering, enriching, and analyzing metadata for Rust crates using AI-powered insights, web scraping, and dependency analysis.
|
51
|
+
|
52
|
+
## Overview
|
53
|
+
|
54
|
+
The Rust Crate Pipeline is designed to collect, process, and enrich metadata from Rust crates available on crates.io. It combines web scraping, AI-powered analysis, and cargo testing to provide comprehensive insights into Rust ecosystem packages.
|
55
|
+
|
56
|
+
## Features
|
57
|
+
|
58
|
+
- **Enhanced Web Scraping**: Automated collection of crate metadata from crates.io using Crawl4AI with Playwright
|
59
|
+
- **AI Enrichment**: Local and Azure OpenAI-powered analysis of crate descriptions, features, and documentation
|
60
|
+
- **Multi-Provider LLM Support**: Unified LLM processor supporting OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
61
|
+
- **Cargo Testing**: Automated cargo build, test, and audit execution for comprehensive crate analysis
|
62
|
+
- **Dependency Analysis**: Deep analysis of crate dependencies and their relationships
|
63
|
+
- **Batch Processing**: Efficient processing of multiple crates with configurable batch sizes
|
64
|
+
- **Data Export**: Structured output in JSON format for further analysis
|
65
|
+
- **RAG Cache**: Intelligent caching with Rule Zero policies and architectural patterns
|
66
|
+
- **Docker Support**: Containerized deployment with optimized Docker configurations
|
67
|
+
- **Real-time Progress Monitoring**: CLI-based progress tracking with ASCII status indicators
|
68
|
+
- **Cross-platform Compatibility**: Full Unicode symbol replacement for better encoding support
|
69
|
+
|
70
|
+
## Requirements
|
71
|
+
|
72
|
+
- **Python 3.12+**: Required for modern type annotations and language features
|
73
|
+
- **Git**: For cloning repositories during analysis
|
74
|
+
- **Cargo**: For Rust crate testing and analysis
|
75
|
+
- **Playwright**: Automatically installed for enhanced web scraping
|
76
|
+
|
77
|
+
## Installation
|
78
|
+
|
79
|
+
```bash
|
80
|
+
# Clone the repository
|
81
|
+
git clone https://github.com/Superuser666-Sigil/SigilDERG-Data_Production.git
|
82
|
+
cd SigilDERG-Data_Production
|
83
|
+
|
84
|
+
# Install in development mode (includes all dependencies)
|
85
|
+
pip install -e .
|
86
|
+
|
87
|
+
# Install Playwright browsers for enhanced scraping
|
88
|
+
playwright install
|
89
|
+
```
|
90
|
+
|
91
|
+
### Automatic Dependency Installation
|
92
|
+
|
93
|
+
The package automatically installs all required dependencies including:
|
94
|
+
- `crawl4ai` for web scraping
|
95
|
+
- `playwright` for enhanced browser automation
|
96
|
+
- `requests` for HTTP requests
|
97
|
+
- `aiohttp` for async operations
|
98
|
+
- And all other required packages
|
99
|
+
|
100
|
+
## Configuration
|
101
|
+
|
102
|
+
### Environment Variables
|
103
|
+
|
104
|
+
Set the following environment variables for full functionality:
|
105
|
+
|
106
|
+
```bash
|
107
|
+
# GitHub Personal Access Token (required for API access)
|
108
|
+
export GITHUB_TOKEN="your_github_token_here"
|
109
|
+
|
110
|
+
# Azure OpenAI (optional, for cloud AI processing)
|
111
|
+
export AZURE_OPENAI_ENDPOINT="https://your-resource.openai.azure.com/"
|
112
|
+
export AZURE_OPENAI_API_KEY="your_azure_openai_key"
|
113
|
+
export AZURE_OPENAI_DEPLOYMENT_NAME="your_deployment_name"
|
114
|
+
export AZURE_OPENAI_API_VERSION="2024-02-15-preview"
|
115
|
+
|
116
|
+
# PyPI API Token (optional, for publishing)
|
117
|
+
export PYPI_API_TOKEN="your_pypi_token"
|
118
|
+
|
119
|
+
# LiteLLM Configuration (optional, for multi-provider LLM support)
|
120
|
+
export LITELLM_MODEL="deepseek-coder:33b"
|
121
|
+
export LITELLM_BASE_URL="http://localhost:11434" # For Ollama
|
122
|
+
```
|
123
|
+
|
124
|
+
### Configuration File
|
125
|
+
|
126
|
+
Create a `config.json` file for custom settings:
|
127
|
+
|
128
|
+
```json
|
129
|
+
{
|
130
|
+
"batch_size": 10,
|
131
|
+
"n_workers": 4,
|
132
|
+
"max_retries": 3,
|
133
|
+
"checkpoint_interval": 10,
|
134
|
+
"use_azure_openai": true,
|
135
|
+
"crawl4ai_config": {
|
136
|
+
"max_pages": 5,
|
137
|
+
"concurrency": 2
|
138
|
+
}
|
139
|
+
}
|
140
|
+
```
|
141
|
+
|
142
|
+
## Usage
|
143
|
+
|
144
|
+
### Command Line Interface
|
145
|
+
|
146
|
+
#### Basic Usage
|
147
|
+
|
148
|
+
```bash
|
149
|
+
# Run with default settings
|
150
|
+
python -m rust_crate_pipeline
|
151
|
+
|
152
|
+
# Run with custom batch size
|
153
|
+
python -m rust_crate_pipeline --batch-size 20
|
154
|
+
|
155
|
+
# Run with specific workers
|
156
|
+
python -m rust_crate_pipeline --workers 8
|
157
|
+
|
158
|
+
# Use configuration file
|
159
|
+
python -m rust_crate_pipeline --config-file config.json
|
160
|
+
```
|
161
|
+
|
162
|
+
#### Advanced Options
|
163
|
+
|
164
|
+
```bash
|
165
|
+
# Enable Azure OpenAI processing
|
166
|
+
python -m rust_crate_pipeline --enable-azure-openai
|
167
|
+
|
168
|
+
# Set custom model path for local AI
|
169
|
+
python -m rust_crate_pipeline --model-path /path/to/model.gguf
|
170
|
+
|
171
|
+
# Configure token limits
|
172
|
+
python -m rust_crate_pipeline --max-tokens 2048
|
173
|
+
|
174
|
+
# Set checkpoint interval
|
175
|
+
python -m rust_crate_pipeline --checkpoint-interval 5
|
176
|
+
|
177
|
+
# Enable verbose logging
|
178
|
+
python -m rust_crate_pipeline --log-level DEBUG
|
179
|
+
|
180
|
+
# Enable enhanced scraping with Playwright
|
181
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
182
|
+
|
183
|
+
# Set output directory for results
|
184
|
+
python -m rust_crate_pipeline --output-path ./results
|
185
|
+
```
|
186
|
+
|
187
|
+
#### Enhanced Scraping
|
188
|
+
|
189
|
+
The pipeline now supports enhanced web scraping using Playwright for better data extraction:
|
190
|
+
|
191
|
+
```bash
|
192
|
+
# Enable enhanced scraping (default)
|
193
|
+
python -m rust_crate_pipeline --enable-enhanced-scraping
|
194
|
+
|
195
|
+
# Use basic scraping only
|
196
|
+
python -m rust_crate_pipeline --disable-enhanced-scraping
|
197
|
+
|
198
|
+
# Configure scraping options
|
199
|
+
python -m rust_crate_pipeline --scraping-config '{"max_pages": 10, "concurrency": 3}'
|
200
|
+
```
|
201
|
+
|
202
|
+
#### Multi-Provider LLM Support
|
203
|
+
|
204
|
+
```bash
|
205
|
+
# Use OpenAI
|
206
|
+
python -m rust_crate_pipeline.unified_llm_processor --provider openai --model-name gpt-4
|
207
|
+
|
208
|
+
# Use Azure OpenAI
|
209
|
+
python -m rust_crate_pipeline.unified_llm_processor --provider azure --model-name gpt-4
|
210
|
+
|
211
|
+
# Use Ollama (local)
|
212
|
+
python -m rust_crate_pipeline.unified_llm_processor --provider ollama --model-name deepseek-coder:33b
|
213
|
+
|
214
|
+
# Use LM Studio
|
215
|
+
python -m rust_crate_pipeline.unified_llm_processor --provider openai --base-url http://localhost:1234/v1 --model-name local-model
|
216
|
+
|
217
|
+
# Use LiteLLM
|
218
|
+
python -m rust_crate_pipeline.unified_llm_processor --provider litellm --model-name deepseek-coder:33b
|
219
|
+
```
|
220
|
+
|
221
|
+
#### Production Mode
|
222
|
+
|
223
|
+
```bash
|
224
|
+
# Run production pipeline with optimizations
|
225
|
+
python run_production.py
|
226
|
+
|
227
|
+
# Run with Sigil Protocol integration
|
228
|
+
python -m rust_crate_pipeline --enable-sigil-protocol
|
229
|
+
```
|
230
|
+
|
231
|
+
### Programmatic Usage
|
232
|
+
|
233
|
+
```python
|
234
|
+
from rust_crate_pipeline import CrateDataPipeline
|
235
|
+
from rust_crate_pipeline.config import PipelineConfig
|
236
|
+
|
237
|
+
# Create configuration
|
238
|
+
config = PipelineConfig(
|
239
|
+
batch_size=10,
|
240
|
+
n_workers=4,
|
241
|
+
use_azure_openai=True
|
242
|
+
)
|
243
|
+
|
244
|
+
# Initialize pipeline
|
245
|
+
pipeline = CrateDataPipeline(config)
|
246
|
+
|
247
|
+
# Run pipeline
|
248
|
+
import asyncio
|
249
|
+
result = asyncio.run(pipeline.run())
|
250
|
+
```
|
251
|
+
|
252
|
+
## Sample Data
|
253
|
+
|
254
|
+
### Input: Crate List
|
255
|
+
|
256
|
+
The pipeline processes crates from `rust_crate_pipeline/crate_list.txt`:
|
257
|
+
|
258
|
+
```
|
259
|
+
tokio
|
260
|
+
serde
|
261
|
+
reqwest
|
262
|
+
actix-web
|
263
|
+
clap
|
264
|
+
```
|
265
|
+
|
266
|
+
### Output: Enriched Crate Data
|
267
|
+
|
268
|
+
```json
|
269
|
+
{
|
270
|
+
"name": "tokio",
|
271
|
+
"version": "1.35.1",
|
272
|
+
"description": "An asynchronous runtime for Rust",
|
273
|
+
"downloads": 125000000,
|
274
|
+
"github_stars": 21500,
|
275
|
+
"keywords": ["async", "runtime", "tokio", "futures"],
|
276
|
+
"categories": ["asynchronous", "network-programming"],
|
277
|
+
"features": {
|
278
|
+
"full": ["all features enabled"],
|
279
|
+
"rt": ["runtime features"],
|
280
|
+
"macros": ["macro support"]
|
281
|
+
},
|
282
|
+
"readme_summary": "Tokio is an asynchronous runtime for Rust that provides the building blocks for writing network applications.",
|
283
|
+
"use_case": "Networking",
|
284
|
+
"factual_counterfactual": "✅ Factual: Tokio provides async I/O primitives\n❌ Counterfactual: Tokio is a synchronous runtime",
|
285
|
+
"score": 9.5,
|
286
|
+
"cargo_test_results": {
|
287
|
+
"build_success": true,
|
288
|
+
"test_success": true,
|
289
|
+
"audit_clean": true,
|
290
|
+
"dependencies": 45
|
291
|
+
},
|
292
|
+
"ai_insights": {
|
293
|
+
"complexity": "High",
|
294
|
+
"maturity": "Production Ready",
|
295
|
+
"community_health": "Excellent"
|
296
|
+
}
|
297
|
+
}
|
298
|
+
```
|
299
|
+
|
300
|
+
## Architecture
|
301
|
+
|
302
|
+
### Core Components
|
303
|
+
|
304
|
+
- **Pipeline Orchestrator**: Manages the overall data processing workflow
|
305
|
+
- **Web Scraper**: Collects crate metadata using Crawl4AI
|
306
|
+
- **AI Enricher**: Enhances data with local or cloud AI analysis
|
307
|
+
- **Cargo Analyzer**: Executes cargo commands for comprehensive testing
|
308
|
+
- **Data Exporter**: Outputs structured results in various formats
|
309
|
+
|
310
|
+
### Data Flow
|
311
|
+
|
312
|
+
1. **Input**: Crate names from `crate_list.txt`
|
313
|
+
2. **Scraping**: Web scraping of crates.io for metadata
|
314
|
+
3. **Enrichment**: AI-powered analysis and insights
|
315
|
+
4. **Testing**: Cargo build, test, and audit execution
|
316
|
+
5. **Output**: Structured JSON with comprehensive crate analysis
|
317
|
+
|
318
|
+
## Development
|
319
|
+
|
320
|
+
### Prerequisites
|
321
|
+
|
322
|
+
- Python 3.12+ (required for modern type annotations)
|
323
|
+
- Git for version control
|
324
|
+
- Cargo for Rust crate testing
|
325
|
+
|
326
|
+
### Running Tests
|
327
|
+
|
328
|
+
```bash
|
329
|
+
# Run all tests
|
330
|
+
pytest tests/
|
331
|
+
|
332
|
+
# Run specific test module
|
333
|
+
pytest tests/test_main_integration.py
|
334
|
+
|
335
|
+
# Run with coverage
|
336
|
+
pytest --cov=rust_crate_pipeline tests/
|
337
|
+
|
338
|
+
# Run type checking
|
339
|
+
pyright rust_crate_pipeline/
|
340
|
+
|
341
|
+
# Run linting
|
342
|
+
flake8 rust_crate_pipeline/
|
343
|
+
```
|
344
|
+
|
345
|
+
### Code Quality
|
346
|
+
|
347
|
+
```bash
|
348
|
+
# Format code
|
349
|
+
black rust_crate_pipeline/
|
350
|
+
|
351
|
+
# Sort imports
|
352
|
+
isort rust_crate_pipeline/
|
353
|
+
|
354
|
+
# Type checking
|
355
|
+
pyright rust_crate_pipeline/
|
356
|
+
|
357
|
+
# Lint code
|
358
|
+
flake8 rust_crate_pipeline/
|
359
|
+
```
|
360
|
+
|
361
|
+
### Building and Publishing
|
362
|
+
|
363
|
+
```bash
|
364
|
+
# Build package
|
365
|
+
python -m build
|
366
|
+
|
367
|
+
# Upload to PyPI (requires PYPI_API_TOKEN)
|
368
|
+
python -m twine upload dist/*
|
369
|
+
|
370
|
+
# Create release
|
371
|
+
python scripts/create_release.py
|
372
|
+
```
|
373
|
+
|
374
|
+
### Docker Development
|
375
|
+
|
376
|
+
```bash
|
377
|
+
# Build Docker image
|
378
|
+
docker build -t rust-crate-pipeline .
|
379
|
+
|
380
|
+
# Run in Docker
|
381
|
+
docker run -it rust-crate-pipeline
|
382
|
+
|
383
|
+
# Run with volume mount for development
|
384
|
+
docker run -it -v $(pwd):/app rust-crate-pipeline
|
385
|
+
```
|
386
|
+
|
387
|
+
## Recent Improvements
|
388
|
+
|
389
|
+
### Version 1.4.0
|
390
|
+
- **Security**: Robust Ed25519/RSA cryptographic signing and provenance
|
391
|
+
- **Automation**: Automated RAG and provenance workflows
|
392
|
+
- **CI/CD**: Improved GitHub Actions for validation and publishing
|
393
|
+
- **Docker**: Updated Docker image and compose for new version
|
394
|
+
- **Bug Fixes**: Workflow and validation fixes for Ed25519
|
395
|
+
|
396
|
+
### Version 1.3.6
|
397
|
+
- **Python 3.12+ Requirement**: Updated to use modern type annotations and language features
|
398
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase with modern syntax
|
399
|
+
- **Build System**: Updated pyproject.toml and setup.py for better compatibility
|
400
|
+
|
401
|
+
### Version 1.3.5
|
402
|
+
- **Enhanced Web Scraping**: Added Playwright-based scraping for better data extraction
|
403
|
+
- **Unicode Compatibility**: Replaced all Unicode symbols with ASCII equivalents for better cross-platform support
|
404
|
+
- **Automatic Dependencies**: All required packages are now automatically installed
|
405
|
+
- **Real-time Progress**: Added CLI-based progress monitoring with ASCII status indicators
|
406
|
+
- **Docker Optimization**: Updated Dockerfile to include Playwright browser installation
|
407
|
+
|
408
|
+
### Version 1.3.4
|
409
|
+
- **PEP8 Compliance**: Fixed all Unicode emoji and symbols for better encoding support
|
410
|
+
- **Cross-platform Compatibility**: Improved compatibility across different operating systems
|
411
|
+
- **Type Safety**: Enhanced type annotations throughout the codebase
|
412
|
+
|
413
|
+
### Version 1.3.3
|
414
|
+
- **Real-time Progress Monitoring**: Added CLI-only progress tracking feature
|
415
|
+
- **Enhanced Logging**: Improved status reporting and error handling
|
416
|
+
|
417
|
+
### Version 1.3.2
|
418
|
+
- **Multi-Provider LLM Support**: Added support for OpenAI, Azure OpenAI, Ollama, LM Studio, and LiteLLM
|
419
|
+
- **Unified LLM Processor**: Centralized LLM processing with provider abstraction
|
420
|
+
- **Enhanced Error Handling**: Better error recovery and retry mechanisms
|
421
|
+
|
422
|
+
## License
|
423
|
+
|
424
|
+
MIT License - see LICENSE file for details.
|
425
|
+
|
426
|
+
## Contributing
|
427
|
+
|
428
|
+
1. Fork the repository
|
429
|
+
2. Create a feature branch
|
430
|
+
3. Make your changes
|
431
|
+
4. Add tests for new functionality
|
432
|
+
5. Submit a pull request
|
433
|
+
|
434
|
+
## Support
|
435
|
+
|
436
|
+
For issues and questions:
|
437
|
+
- GitHub Issues: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production/issues
|
438
|
+
- Documentation: https://github.com/Superuser666-Sigil/SigilDERG-Data_Production#readme
|
439
|
+
|
440
|
+
## API Compliance & Attribution
|
441
|
+
|
442
|
+
### crates.io and GitHub API Usage
|
443
|
+
- This project accesses crates.io and GitHub APIs for data gathering and verification.
|
444
|
+
- **User-Agent:** All requests use:
|
445
|
+
|
446
|
+
`SigilDERG-Data-Production (Superuser666-Sigil; miragemodularframework@gmail.com; https://github.com/Superuser666-Sigil/SigilDERG-Data_Production)`
|
447
|
+
- **Contact:** miragemodularframework@gmail.com
|
448
|
+
- **GitHub:** [Superuser666-Sigil/SigilDERG-Data_Production](https://github.com/Superuser666-Sigil/SigilDERG-Data_Production)
|
449
|
+
- The project respects all rate limits and crawler policies. If you have questions or concerns, please contact us.
|
450
|
+
|
451
|
+
### Crawl4AI Attribution
|
452
|
+
This project uses [Crawl4AI](https://github.com/unclecode/crawl4ai) for web data extraction.
|
453
|
+
|
454
|
+
<!-- Badge Attribution (Disco Theme) -->
|
455
|
+
<a href="https://github.com/unclecode/crawl4ai">
|
456
|
+
<img src="https://raw.githubusercontent.com/unclecode/crawl4ai/main/docs/assets/powered-by-disco.svg" alt="Powered by Crawl4AI" width="200"/>
|
457
|
+
</a>
|
458
|
+
|
459
|
+
Or, use this text attribution:
|
460
|
+
|
461
|
+
```
|
462
|
+
This project uses Crawl4AI (https://github.com/unclecode/crawl4ai) for web data extraction.
|
463
|
+
```
|
464
|
+
|
465
|
+
## 🚀 Unified, Cross-Platform, Multi-Provider LLM Support
|
466
|
+
|
467
|
+
This project supports **all major LLM providers** (cloud and local) on **Mac, Linux, and Windows** using a single, unified interface. All LLM calls are routed through the `UnifiedLLMProcessor` and `LLMConfig` abstractions, ensuring:
|
468
|
+
|
469
|
+
- **One code path for all providers:** Azure OpenAI, OpenAI, Anthropic, Google, Cohere, HuggingFace, Ollama, LM Studio, and any OpenAI-compatible endpoint.
|
470
|
+
- **Cross-platform compatibility:** Works out of the box on Mac, Linux, and Windows.
|
471
|
+
- **Configurable via CLI and config files:** Select provider, model, API key, endpoint, and provider-specific options at runtime.
|
472
|
+
- **Easy extensibility:** Add new providers by updating your config or CLI arguments—no code changes needed.
|
473
|
+
|
474
|
+
### 📖 Provider Setup & Usage
|
475
|
+
- See [`README_LLM_PROVIDERS.md`](./README_LLM_PROVIDERS.md) for full details, setup instructions, and usage examples for every supported provider.
|
476
|
+
- Run `python run_pipeline_with_llm.py --help` for CLI options and provider-specific arguments.
|
477
|
+
|
478
|
+
### 🧩 Example Usage
|
479
|
+
```bash
|
480
|
+
# Azure OpenAI
|
481
|
+
python run_pipeline_with_llm.py --llm-provider azure --llm-model gpt-4o --crates tokio
|
482
|
+
|
483
|
+
# Ollama (local)
|
484
|
+
python run_pipeline_with_llm.py --llm-provider ollama --llm-model llama2 --crates serde
|
485
|
+
|
486
|
+
# OpenAI API
|
487
|
+
python run_pipeline_with_llm.py --llm-provider openai --llm-model gpt-4 --llm-api-key YOUR_KEY --crates tokio
|
488
|
+
|
489
|
+
# Anthropic Claude
|
490
|
+
python run_pipeline_with_llm.py --llm-provider anthropic --llm-model claude-3-sonnet --llm-api-key YOUR_KEY --crates serde
|
491
|
+
```
|
492
|
+
|
493
|
+
### 🔒 Security & Best Practices
|
494
|
+
- Store API keys as environment variables.
|
495
|
+
- Use local providers (Ollama, LM Studio) for full privacy—no data leaves your machine.
|
496
|
+
- All LLM calls are routed through a single, auditable interface for maximum maintainability and security.
|
497
|
+
|
498
|
+
### 🧪 Testing
|
499
|
+
- Run `python test_unified_llm.py` to verify provider support and configuration.
|
500
|
+
|
501
|
+
For more, see [`README_LLM_PROVIDERS.md`](./README_LLM_PROVIDERS.md) and the CLI help output.
|
502
|
+
|
503
|
+
## Public RAG Database Hash Verification
|
504
|
+
|
505
|
+
The canonical hash of the RAG SQLite database (`sigil_rag_cache.db`) is stored in the public file `sigil_rag_cache.hash`.
|
506
|
+
|
507
|
+
- **Purpose:** Anyone can verify the integrity of the RAG database by comparing its SHA256 hash to the value in `sigil_rag_cache.hash`.
|
508
|
+
- **How to verify:**
|
509
|
+
|
510
|
+
```sh
|
511
|
+
python audits/validate_db_hash.py --db sigil_rag_cache.db --expected-hash "$(cat sigil_rag_cache.hash)"
|
512
|
+
```
|
513
|
+
|
514
|
+
- **CI/CD:** The GitHub Actions workflow `.github/workflows/validate-db-hash.yml` automatically checks this on every push.
|
515
|
+
- **No secrets required:** The hash is public and verifiable by anyone.
|
@@ -0,0 +1,31 @@
|
|
1
|
+
rust_crate_pipeline/__init__.py,sha256=ZJCApGu8h2Rn5-dkoBLXOpdoeD6b36w76--o0fEismQ,1749
|
2
|
+
rust_crate_pipeline/__main__.py,sha256=PexSWQYtbFQg5P36WEnJ0X-oAtT8WDej3bIJoSAcCCQ,157
|
3
|
+
rust_crate_pipeline/ai_processing.py,sha256=Q_jmIL0OzFcP6zSKTgrIikUTHuUB3Py4MqwLXmB7-KQ,29057
|
4
|
+
rust_crate_pipeline/analysis.py,sha256=_cmjynLWaQbGIdLQHU3P3rfqHB3gcNNgCdzStbsKrdw,17021
|
5
|
+
rust_crate_pipeline/azure_ai_processing.py,sha256=h2ZUaFPt5LmTH--5CXfXBdbKnoJA4Ha8zCfbLawhDz8,16409
|
6
|
+
rust_crate_pipeline/config.py,sha256=Fw3fRKCZawKaLQi7YqsmNNku4whZi89mWzr8BVRNS5E,3009
|
7
|
+
rust_crate_pipeline/crate_analysis.py,sha256=GsoXemJ9VFyAbb4Sm5gY5ToTqNtOA4pI38AtngAQONk,2090
|
8
|
+
rust_crate_pipeline/crate_list.txt,sha256=W3NxDtxvihyKp9SN85FYXX6p8Hh49IFih1M4-c-CynM,4334
|
9
|
+
rust_crate_pipeline/github_token_checker.py,sha256=0IpTh78DSaw4znaed031cSVSZDsi92eDManPzRIIN3Y,3670
|
10
|
+
rust_crate_pipeline/main.py,sha256=iGYEAYvXkoFFvaA6DIVGiUL3wLhiCzatB6Fvf-Yrj2A,18858
|
11
|
+
rust_crate_pipeline/network.py,sha256=mWjiRvOX31piBZ2QiJ-F75DBD4l6cqzTXcQdJvHxe90,12718
|
12
|
+
rust_crate_pipeline/pipeline.py,sha256=CqPHLLRvMOpy-3ONL6hnPahV6Vh6S4M8oDsHd_lDrPc,16203
|
13
|
+
rust_crate_pipeline/production_config.py,sha256=uWylP9AIZZx7-9aT4sFmAKEEW9miJDxaiek8VE6WP-0,2372
|
14
|
+
rust_crate_pipeline/progress_monitor.py,sha256=5K9KP-Xggi1JEINfRmq2W-wGUHtNIBTcocpDtB1t8iM,13743
|
15
|
+
rust_crate_pipeline/unified_llm_processor.py,sha256=eo7KotNuqwc7_hgpFm18QLokFoufFslnvi8TnDsSYEg,25064
|
16
|
+
rust_crate_pipeline/unified_pipeline.py,sha256=2yglmXVlQfSkVq0HVTPonDee6VxWaQWZw0X2l4lLBGw,23704
|
17
|
+
rust_crate_pipeline/version.py,sha256=whkmTDquEVytez4svUFUBfbfK0EOvDTPA8K5TuZffbE,4481
|
18
|
+
rust_crate_pipeline/core/__init__.py,sha256=Sq4HWdANGqoYln7JdCog7m3BsGeR3tHdseeflvNetoQ,509
|
19
|
+
rust_crate_pipeline/core/canon_registry.py,sha256=_3cu0akJvLc7ZnomMaLeMa8adOBYn1dtjpB0yE3vGL8,4700
|
20
|
+
rust_crate_pipeline/core/irl_engine.py,sha256=QRZUdkN24W9XutLkj8JDplEz6FmnquUrwKsl0s2zRr4,10491
|
21
|
+
rust_crate_pipeline/core/sacred_chain.py,sha256=6s4gFLDT6KUwuu0Fpxu6h_YHlsEvHZb3CQw4tRHGyDU,3773
|
22
|
+
rust_crate_pipeline/scraping/__init__.py,sha256=ySkTRg7nIxgcbHJQ3L1XzcrOo281NZu07-XtiGi-558,307
|
23
|
+
rust_crate_pipeline/scraping/unified_scraper.py,sha256=ZE2gkc0vQ3BOLdSX_IV-kMe8QAm2Av4M7VqpkxEKyT4,9965
|
24
|
+
rust_crate_pipeline/utils/file_utils.py,sha256=tMaCPy7ghs9x4Hxu_sviX8MXU2sBjNvohUrvt4MejoM,2853
|
25
|
+
rust_crate_pipeline/utils/logging_utils.py,sha256=e5jG0Yd6k3exgAdbVca46kWADJ_Qz8UJ3yEJzwTqPyI,2452
|
26
|
+
rust_crate_pipeline-1.4.1.dist-info/licenses/LICENSE,sha256=tpd4XNpbssrSx9-iErATOLrOh0ivNPfO2I5MAPUpats,1088
|
27
|
+
rust_crate_pipeline-1.4.1.dist-info/METADATA,sha256=OY5aKfWvpdRnLr9oKJ0SyX1N6evt1IYvu4J4GuFjwy0,17605
|
28
|
+
rust_crate_pipeline-1.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
29
|
+
rust_crate_pipeline-1.4.1.dist-info/entry_points.txt,sha256=9Rr_IRuFRIridXxUSdEJbB3ba0NnpEfKmknZXFdYRC0,70
|
30
|
+
rust_crate_pipeline-1.4.1.dist-info/top_level.txt,sha256=GUdB7RyxHLhijQxui_KTy3B8p_L2APui9C6RYa0FuaE,20
|
31
|
+
rust_crate_pipeline-1.4.1.dist-info/RECORD,,
|