rag-knowledge-preparation 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_knowledge_preparation-1.0.0/CHANGELOG.md +7 -0
- rag_knowledge_preparation-1.0.0/LICENSE +21 -0
- rag_knowledge_preparation-1.0.0/MANIFEST.in +4 -0
- rag_knowledge_preparation-1.0.0/PKG-INFO +575 -0
- rag_knowledge_preparation-1.0.0/README.md +519 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/__init__.py +65 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/__init__.py +73 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/CodeAnalyzer.py +211 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/CodeSummarizer.py +162 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/DependencyAnalyzer.py +119 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/__init__.py +27 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/CodebaseConfig.py +171 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/CodebaseConverter.py +83 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/__init__.py +27 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/export/CodebaseExporter.py +345 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/export/__init__.py +11 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/CodebaseConstants.py +34 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/FileUtils.py +102 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/__init__.py +35 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/DocumentConverter.py +260 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/DocumentProcessingConfig.py +126 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/__init__.py +31 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/utils/CustomExceptions.py +18 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/utils/__init__.py +0 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/PKG-INFO +575 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/SOURCES.txt +40 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/dependency_links.txt +1 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/requires.txt +24 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/top_level.txt +1 -0
- rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/zip-safe +1 -0
- rag_knowledge_preparation-1.0.0/setup.cfg +12 -0
- rag_knowledge_preparation-1.0.0/setup.py +71 -0
- rag_knowledge_preparation-1.0.0/tests/test_CodebaseAnalyzer.py +327 -0
- rag_knowledge_preparation-1.0.0/tests/test_CodebasePerformanceOptimizer.py +213 -0
- rag_knowledge_preparation-1.0.0/tests/test_CustomExceptions.py +78 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentBatchProcessor.py +189 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentConfiguration.py +192 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentConverter.py +97 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentErrorHandler.py +94 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentFormatSupport.py +63 -0
- rag_knowledge_preparation-1.0.0/tests/test_DocumentProcessingModes.py +143 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 lib-ragknowledgepreparation-python
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rag_knowledge_preparation
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: RAG Knowledge Preparation in Python
|
|
5
|
+
Home-page: https://bitbucket.org/entinco/eic-aimodelknowledge-utils/src/master/lib-ragknowledgepreparation-python
|
|
6
|
+
Author: Enterprise Innovation Consulting LLC
|
|
7
|
+
Author-email: seroukhov@entinco.com
|
|
8
|
+
License: Commercial
|
|
9
|
+
Platform: any
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: Other/Proprietary License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pytest
|
|
22
|
+
Requires-Dist: requests<3.0.0,>=2.27.1
|
|
23
|
+
Requires-Dist: urllib3<2.0.0,>=1.26.8
|
|
24
|
+
Requires-Dist: docling>=1.3.0
|
|
25
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
26
|
+
Requires-Dist: pathspec>=0.11.0
|
|
27
|
+
Requires-Dist: tree-sitter>=0.20.0
|
|
28
|
+
Requires-Dist: tree-sitter-python>=0.20.0
|
|
29
|
+
Requires-Dist: tree-sitter-javascript>=0.20.0
|
|
30
|
+
Requires-Dist: tree-sitter-typescript>=0.20.0
|
|
31
|
+
Requires-Dist: pygments>=2.15.0
|
|
32
|
+
Requires-Dist: pydantic>=2.0.0
|
|
33
|
+
Requires-Dist: chardet>=5.0.0
|
|
34
|
+
Requires-Dist: google-generativeai>=0.3.0
|
|
35
|
+
Requires-Dist: pip-services4-commons>=0.0.0
|
|
36
|
+
Requires-Dist: pip-services4-components>=0.0.0
|
|
37
|
+
Requires-Dist: pip-services4-config>=0.0.0
|
|
38
|
+
Requires-Dist: pip-services4-data>=0.0.0
|
|
39
|
+
Requires-Dist: pip-services4-http>=0.0.0
|
|
40
|
+
Requires-Dist: pip-services4-mongodb>=0.0.0
|
|
41
|
+
Requires-Dist: pip-services4-persistence>=0.0.0
|
|
42
|
+
Requires-Dist: pip-services4-prometheus>=0.0.0
|
|
43
|
+
Requires-Dist: pip-services4-rpc>=0.0.0
|
|
44
|
+
Requires-Dist: pip-services4-swagger>=0.0.0
|
|
45
|
+
Dynamic: author
|
|
46
|
+
Dynamic: author-email
|
|
47
|
+
Dynamic: classifier
|
|
48
|
+
Dynamic: description
|
|
49
|
+
Dynamic: description-content-type
|
|
50
|
+
Dynamic: home-page
|
|
51
|
+
Dynamic: license
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
Dynamic: platform
|
|
54
|
+
Dynamic: requires-dist
|
|
55
|
+
Dynamic: summary
|
|
56
|
+
|
|
57
|
+
# RAG Knowledge Preparation Python
|
|
58
|
+
|
|
59
|
+
A comprehensive Python library for preparing knowledge bases for Retrieval-Augmented Generation (RAG) systems. This library provides powerful tools for document processing with OCR capabilities, advanced table processing, and intelligent codebase analysis.
|
|
60
|
+
|
|
61
|
+
## Features
|
|
62
|
+
|
|
63
|
+
### Document Processing Features
|
|
64
|
+
|
|
65
|
+
- **Multi-format Support**: Convert PDF, DOCX, HTML, CSV, and other formats to Markdown
|
|
66
|
+
- **OCR Integration**: Extract text from scanned documents using EasyOCR or Tesseract
|
|
67
|
+
- **Advanced Table Processing**: Intelligent table detection and conversion using TableFormer
|
|
68
|
+
- **Batch Processing**: Process multiple documents or entire folders efficiently
|
|
69
|
+
- **Configurable Quality**: Multiple processing presets for different use cases
|
|
70
|
+
|
|
71
|
+
### Codebase Analysis Features
|
|
72
|
+
|
|
73
|
+
- **Comprehensive Analysis**: Extract structure, dependencies, and metadata from codebases
|
|
74
|
+
- **Multi-language Support**: Python, JavaScript, TypeScript and more
|
|
75
|
+
- **AI-Powered Summaries**: Generate intelligent code summaries using Google Gemini
|
|
76
|
+
- **Dependency Analysis**: Identify and categorize internal, external, and standard library dependencies
|
|
77
|
+
- **Structure Extraction**: Parse classes, functions, imports, and code organization
|
|
78
|
+
- **Token Estimation**: Accurate token counting for RAG optimization
|
|
79
|
+
|
|
80
|
+
### Configuration & Customization
|
|
81
|
+
|
|
82
|
+
- **Flexible Configuration**: Extensive configuration options for both document and codebase processing
|
|
83
|
+
- **Preset Configurations**: Pre-built configurations for common use cases
|
|
84
|
+
- **Custom Metadata**: Configurable metadata fields for different analysis needs
|
|
85
|
+
- **Performance Optimization**: Built-in performance modes for large-scale processing
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install rag-knowledge-preparation
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Development Installation
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://bitbucket.org/entinco/eic-aimodelknowledge-utils.git
|
|
97
|
+
cd eic-aimodelknowledge-utils/lib-ragknowledgepreparation-python
|
|
98
|
+
pip install -e ".[dev]"
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Quick Start
|
|
102
|
+
|
|
103
|
+
### Document Processing
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from rag_knowledge_preparation import (
|
|
107
|
+
convert_document_to_markdown,
|
|
108
|
+
convert_scanned_document_to_markdown,
|
|
109
|
+
convert_documents_batch
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# Convert a single document
|
|
113
|
+
markdown_content = convert_document_to_markdown("document.pdf")
|
|
114
|
+
|
|
115
|
+
# Convert a scanned document with OCR
|
|
116
|
+
scanned_content = convert_scanned_document_to_markdown("scanned_document.pdf")
|
|
117
|
+
|
|
118
|
+
# Process multiple documents
|
|
119
|
+
results = convert_documents_batch(["doc1.pdf", "doc2.docx", "doc3.html"])
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Codebase Analysis
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from rag_knowledge_preparation import (
|
|
126
|
+
export_codebase_to_markdown,
|
|
127
|
+
analyze_codebase_structure,
|
|
128
|
+
get_codebase_overview
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
# Export entire codebase to Markdown
|
|
132
|
+
output_file = export_codebase_to_markdown("./my_project", "codebase_export.md")
|
|
133
|
+
|
|
134
|
+
# Analyze codebase structure
|
|
135
|
+
structure = analyze_codebase_structure("./my_project")
|
|
136
|
+
|
|
137
|
+
# Get high-level overview
|
|
138
|
+
overview = get_codebase_overview("./my_project")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Document Processing Details
|
|
142
|
+
|
|
143
|
+
### Supported Formats
|
|
144
|
+
|
|
145
|
+
- **PDF**: Native PDF processing with OCR support
|
|
146
|
+
- **Microsoft Office**: DOCX, DOC, PPTX, PPT
|
|
147
|
+
- **Web Formats**: HTML, XML
|
|
148
|
+
- **Data Formats**: CSV, TSV, JSON
|
|
149
|
+
- **Text Formats**: TXT, MD, RST
|
|
150
|
+
|
|
151
|
+
### Processing Presets
|
|
152
|
+
|
|
153
|
+
#### Basic Processing
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from rag_knowledge_preparation import convert_document_to_markdown
|
|
157
|
+
|
|
158
|
+
# Basic processing without OCR
|
|
159
|
+
content = convert_document_to_markdown(
|
|
160
|
+
"document.pdf",
|
|
161
|
+
processing_preset="basic"
|
|
162
|
+
)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
#### Standard Document Processing
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
# Standard processing with OCR and advanced tables
|
|
169
|
+
content = convert_document_to_markdown(
|
|
170
|
+
"document.pdf",
|
|
171
|
+
processing_preset="standard"
|
|
172
|
+
)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
#### OCR-Heavy Processing
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
# Heavy OCR processing for scanned documents
|
|
179
|
+
content = convert_document_to_markdown(
|
|
180
|
+
"scanned_document.pdf",
|
|
181
|
+
processing_preset="ocr_heavy"
|
|
182
|
+
)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
#### Table-Focused Processing
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
# Optimized for documents with complex tables
|
|
189
|
+
content = convert_document_to_markdown(
|
|
190
|
+
"data_heavy_document.pdf",
|
|
191
|
+
processing_preset="table_focused"
|
|
192
|
+
)
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
#### High-Quality Processing
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
# Maximum quality with all features enabled
|
|
199
|
+
content = convert_document_to_markdown(
|
|
200
|
+
"important_document.pdf",
|
|
201
|
+
processing_preset="high_quality"
|
|
202
|
+
)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Custom Configuration
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
from rag_knowledge_preparation import convert_document_to_markdown
|
|
209
|
+
|
|
210
|
+
# Custom configuration
|
|
211
|
+
content = convert_document_to_markdown(
|
|
212
|
+
"document.pdf",
|
|
213
|
+
processing_preset="standard",
|
|
214
|
+
ocr_engine="tesseract",
|
|
215
|
+
ocr_language="en",
|
|
216
|
+
table_confidence_threshold=0.9,
|
|
217
|
+
enable_cell_matching=True
|
|
218
|
+
)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### Batch Processing
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from rag_knowledge_preparation import convert_documents_batch, convert_folder_to_markdown
|
|
225
|
+
|
|
226
|
+
# Process multiple files
|
|
227
|
+
results = convert_documents_batch([
|
|
228
|
+
"document1.pdf",
|
|
229
|
+
"document2.docx",
|
|
230
|
+
"document3.html"
|
|
231
|
+
])
|
|
232
|
+
|
|
233
|
+
# Process entire folder
|
|
234
|
+
folder_results = convert_folder_to_markdown("./documents/")
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## Codebase Analysis Usage
|
|
238
|
+
|
|
239
|
+
### Basic Analysis
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
from rag_knowledge_preparation import analyze_codebase_structure
|
|
243
|
+
|
|
244
|
+
# Analyze codebase structure
|
|
245
|
+
structure = analyze_codebase_structure("./my_project")
|
|
246
|
+
|
|
247
|
+
print(f"Total files: {structure['total_files']}")
|
|
248
|
+
print(f"Total lines: {structure['total_lines']}")
|
|
249
|
+
print(f"Languages: {structure['languages']}")
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Export to Markdown
|
|
253
|
+
|
|
254
|
+
```python
|
|
255
|
+
from rag_knowledge_preparation import export_codebase_to_markdown
|
|
256
|
+
|
|
257
|
+
# Export with default settings
|
|
258
|
+
output_file = export_codebase_to_markdown("./my_project")
|
|
259
|
+
|
|
260
|
+
# Export with custom output file
|
|
261
|
+
output_file = export_codebase_to_markdown(
|
|
262
|
+
"./my_project",
|
|
263
|
+
output_file="my_codebase.md"
|
|
264
|
+
)
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### AI-Powered Analysis
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
from rag_knowledge_preparation import export_codebase_to_markdown
|
|
271
|
+
|
|
272
|
+
# Export with AI summaries (requires Gemini API key)
|
|
273
|
+
output_file = export_codebase_to_markdown(
|
|
274
|
+
"./my_project",
|
|
275
|
+
gemini_api_key="your-gemini-api-key",
|
|
276
|
+
gemini_model="gemini-pro"
|
|
277
|
+
)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### Codebase Processing Presets
|
|
281
|
+
|
|
282
|
+
#### Minimal Processing
|
|
283
|
+
|
|
284
|
+
```python
|
|
285
|
+
from rag_knowledge_preparation import export_codebase_to_markdown
|
|
286
|
+
|
|
287
|
+
# Minimal processing - basic analysis only
|
|
288
|
+
output_file = export_codebase_to_markdown(
|
|
289
|
+
"./my_project",
|
|
290
|
+
processing_preset="minimal"
|
|
291
|
+
)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
#### Standard Processing
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
# Standard processing with full analysis
|
|
298
|
+
output_file = export_codebase_to_markdown(
|
|
299
|
+
"./my_project",
|
|
300
|
+
processing_preset="standard"
|
|
301
|
+
)
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
#### Comprehensive Processing
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
# Comprehensive processing with all features
|
|
308
|
+
output_file = export_codebase_to_markdown(
|
|
309
|
+
"./my_project",
|
|
310
|
+
processing_preset="comprehensive"
|
|
311
|
+
)
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Configuration Options
|
|
315
|
+
|
|
316
|
+
```python
|
|
317
|
+
from rag_knowledge_preparation import (
|
|
318
|
+
CodebaseProcessingConfig,
|
|
319
|
+
MetadataConfig,
|
|
320
|
+
export_codebase_to_markdown
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
# Custom configuration
|
|
324
|
+
config = CodebaseProcessingConfig(
|
|
325
|
+
max_file_size_mb=2.0,
|
|
326
|
+
include_test_files=False,
|
|
327
|
+
include_documentation=True,
|
|
328
|
+
enable_ai_summary=True,
|
|
329
|
+
gemini_api_key="your-api-key",
|
|
330
|
+
custom_ignore_patterns=["*.log", "temp/*"]
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
# Custom metadata configuration
|
|
334
|
+
metadata_config = MetadataConfig(
|
|
335
|
+
include_file_path=True,
|
|
336
|
+
include_language=True,
|
|
337
|
+
include_purpose=True,
|
|
338
|
+
include_dependencies=True,
|
|
339
|
+
include_structure=True,
|
|
340
|
+
include_summary=True
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
config.metadata_config = metadata_config
|
|
344
|
+
|
|
345
|
+
# Use custom configuration
|
|
346
|
+
output_file = export_codebase_to_markdown(
|
|
347
|
+
"./my_project",
|
|
348
|
+
processing_preset="custom",
|
|
349
|
+
**config.model_dump()
|
|
350
|
+
)
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
## Advanced Features
|
|
354
|
+
|
|
355
|
+
### Language Detection and Classification
|
|
356
|
+
|
|
357
|
+
The library automatically detects programming languages and classifies files by purpose:
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
from rag_knowledge_preparation.codebase_processing.analysis import (
|
|
361
|
+
get_language_from_extension,
|
|
362
|
+
classify_file_by_purpose
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
# Detect language from file extension
|
|
366
|
+
language = get_language_from_extension("script.py") # Returns "python"
|
|
367
|
+
|
|
368
|
+
# Classify file by purpose
|
|
369
|
+
purpose = classify_file_by_purpose("test_utils.py") # Returns "Tests"
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
### Dependency Analysis
|
|
373
|
+
|
|
374
|
+
```python
|
|
375
|
+
from pathlib import Path
|
|
376
|
+
from rag_knowledge_preparation.codebase_processing.analysis import analyze_file_dependencies
|
|
377
|
+
|
|
378
|
+
# Analyze dependencies in a Python file
|
|
379
|
+
with open("main.py", "r", encoding="utf-8") as f:
|
|
380
|
+
content = f.read()
|
|
381
|
+
dependencies = analyze_file_dependencies(content, Path("main.py"), "python")
|
|
382
|
+
|
|
383
|
+
print("External packages:", dependencies["external_packages"])
|
|
384
|
+
print("Standard library:", dependencies["standard_library"])
|
|
385
|
+
print("Internal modules:", dependencies["internal_modules"])
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
### Code Structure Extraction
|
|
389
|
+
|
|
390
|
+
```python
|
|
391
|
+
from pathlib import Path
|
|
392
|
+
from rag_knowledge_preparation.codebase_processing.analysis import extract_code_structure
|
|
393
|
+
|
|
394
|
+
# Extract structure from code file
|
|
395
|
+
code_content = """
|
|
396
|
+
class MyClass:
|
|
397
|
+
def __init__(self):
|
|
398
|
+
pass
|
|
399
|
+
|
|
400
|
+
def method(self):
|
|
401
|
+
pass
|
|
402
|
+
"""
|
|
403
|
+
structure = extract_code_structure(Path("example.py"), "python", code_content)
|
|
404
|
+
|
|
405
|
+
print("Classes:", structure["classes"])
|
|
406
|
+
print("Functions:", structure["functions"])
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Token Estimation
|
|
410
|
+
|
|
411
|
+
```python
|
|
412
|
+
from rag_knowledge_preparation.codebase_processing.analysis import estimate_token_count
|
|
413
|
+
|
|
414
|
+
# Estimate tokens in text
|
|
415
|
+
token_count = estimate_token_count("Hello, world!")
|
|
416
|
+
print(f"Estimated tokens: {token_count}")
|
|
417
|
+
|
|
418
|
+
# Estimate tokens in code
|
|
419
|
+
code_tokens = estimate_token_count("""
|
|
420
|
+
def hello():
|
|
421
|
+
print("Hello, world!")
|
|
422
|
+
""")
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
## Configuration Reference
|
|
426
|
+
|
|
427
|
+
### Document Processing Configuration
|
|
428
|
+
|
|
429
|
+
| Parameter | Type | Default | Description |
|
|
430
|
+
|-----------|------|---------|-------------|
|
|
431
|
+
| `enable_ocr` | bool | True | Enable OCR processing |
|
|
432
|
+
| `table_processing` | str | "advanced" | Table processing mode (basic, advanced, tableformer) |
|
|
433
|
+
| `ocr_engine` | str | "easyocr" | OCR engine (easyocr, tesseract) |
|
|
434
|
+
| `ocr_language` | str | "en" | OCR language (en, fr, de, es) |
|
|
435
|
+
| `table_confidence_threshold` | float | 0.8 | Table detection confidence threshold |
|
|
436
|
+
| `enable_cell_matching` | bool | True | Enable cell matching in tables |
|
|
437
|
+
| `enable_table_structure` | bool | True | Enable table structure analysis |
|
|
438
|
+
|
|
439
|
+
### Codebase Processing Configuration
|
|
440
|
+
|
|
441
|
+
| Parameter | Type | Default | Description |
|
|
442
|
+
|-----------|------|---------|-------------|
|
|
443
|
+
| `max_file_size_mb` | float | 1.0 | Maximum file size to process |
|
|
444
|
+
| `include_hidden_files` | bool | False | Include hidden files |
|
|
445
|
+
| `include_test_files` | bool | True | Include test files |
|
|
446
|
+
| `include_documentation` | bool | True | Include documentation files |
|
|
447
|
+
| `include_config_files` | bool | True | Include configuration files |
|
|
448
|
+
| `enable_structure_analysis` | bool | True | Enable code structure analysis |
|
|
449
|
+
| `enable_ai_summary` | bool | True | Enable AI-powered summaries |
|
|
450
|
+
| `gemini_api_key` | str | None | Google Gemini API key |
|
|
451
|
+
| `gemini_model` | str | "gemini-pro" | Gemini model to use |
|
|
452
|
+
| `custom_ignore_patterns` | List[str] | None | Custom ignore patterns |
|
|
453
|
+
|
|
454
|
+
## Error Handling
|
|
455
|
+
|
|
456
|
+
The library provides comprehensive error handling with custom exceptions:
|
|
457
|
+
|
|
458
|
+
```python
|
|
459
|
+
from rag_knowledge_preparation import (
|
|
460
|
+
RAGKnowledgePreparationError,
|
|
461
|
+
DocumentNotFoundError,
|
|
462
|
+
ConfigurationError,
|
|
463
|
+
ConversionError,
|
|
464
|
+
UnsupportedFormatError,
convert_document_to_markdown
|
|
465
|
+
)
|
|
466
|
+
|
|
467
|
+
try:
|
|
468
|
+
content = convert_document_to_markdown("nonexistent.pdf")
|
|
469
|
+
except DocumentNotFoundError as e:
|
|
470
|
+
print(f"Document not found: {e}")
|
|
471
|
+
except ConversionError as e:
|
|
472
|
+
print(f"Conversion failed: {e}")
|
|
473
|
+
except ConfigurationError as e:
|
|
474
|
+
print(f"Configuration error: {e}")
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
## Performance Considerations
|
|
478
|
+
|
|
479
|
+
### Large File Processing
|
|
480
|
+
|
|
481
|
+
The library includes built-in optimizations for large files:
|
|
482
|
+
|
|
483
|
+
- **File Size Limits**: Configurable maximum file size limits
|
|
484
|
+
- **Memory Efficiency**: Streaming processing for large documents
|
|
485
|
+
- **Batch Processing**: Efficient processing of multiple files
|
|
486
|
+
- **Parallel Processing**: Concurrent processing where possible
|
|
487
|
+
|
|
488
|
+
### Performance Modes
|
|
489
|
+
|
|
490
|
+
```python
|
|
491
|
+
from rag_knowledge_preparation import CodebaseProcessingConfig

# Use performance-optimized settings
|
|
492
|
+
config = CodebaseProcessingConfig(
|
|
493
|
+
max_file_size_mb=0.5, # Smaller file limit
|
|
494
|
+
enable_ai_summary=False, # Disable AI for speed
|
|
495
|
+
enable_structure_analysis=False # Disable structure analysis
|
|
496
|
+
)
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
## Examples
|
|
500
|
+
|
|
501
|
+
### Complete Document Processing Pipeline
|
|
502
|
+
|
|
503
|
+
```python
|
|
504
|
+
from rag_knowledge_preparation import (
|
|
505
|
+
convert_folder_to_markdown,
|
|
506
|
+
list_document_configs
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
# List available configurations
|
|
510
|
+
configs = list_document_configs()
|
|
511
|
+
print("Available configurations:", list(configs.keys()))
|
|
512
|
+
|
|
513
|
+
# Process entire document folder
|
|
514
|
+
results = convert_folder_to_markdown(
|
|
515
|
+
"./documents/",
|
|
516
|
+
processing_preset="high_quality"
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
# Save results
|
|
520
|
+
for file_path, content in results.items():
|
|
521
|
+
output_path = f"processed_{file_path.split('/')[-1]}.md"
|
|
522
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
523
|
+
f.write(content)
|
|
524
|
+
```
|
|
525
|
+
|
|
526
|
+
### Complete Codebase Analysis Pipeline
|
|
527
|
+
|
|
528
|
+
```python
|
|
529
|
+
from rag_knowledge_preparation import (
|
|
530
|
+
export_codebase_to_markdown,
|
|
531
|
+
analyze_codebase_structure,
|
|
532
|
+
get_codebase_overview,
|
|
533
|
+
list_available_codebase_configs
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
# List available configurations
|
|
537
|
+
configs = list_available_codebase_configs()
|
|
538
|
+
print("Available configurations:", list(configs.keys()))
|
|
539
|
+
|
|
540
|
+
# Get overview
|
|
541
|
+
overview = get_codebase_overview("./my_project")
|
|
542
|
+
print(f"Project: {overview['name']}")
|
|
543
|
+
print(f"Files: {overview['total_files']}")
|
|
544
|
+
print(f"Languages: {overview['languages']}")
|
|
545
|
+
|
|
546
|
+
# Analyze structure
|
|
547
|
+
structure = analyze_codebase_structure("./my_project")
|
|
548
|
+
print(f"Structure analysis complete: {structure['total_files']} files processed")
|
|
549
|
+
|
|
550
|
+
# Export to Markdown
|
|
551
|
+
output_file = export_codebase_to_markdown(
|
|
552
|
+
"./my_project",
|
|
553
|
+
output_file="project_analysis.md",
|
|
554
|
+
gemini_api_key="your-api-key"
|
|
555
|
+
)
|
|
556
|
+
print(f"Exported to: {output_file}")
|
|
557
|
+
```
|
|
558
|
+
|
|
559
|
+
## Acknowledgments
|
|
560
|
+
|
|
561
|
+
- [Docling](https://github.com/DS4SD/docling) for document processing capabilities
|
|
562
|
+
- [Tree-sitter](https://tree-sitter.github.io/) for code parsing
|
|
563
|
+
- [Google Gemini](https://ai.google.dev/) for AI-powered summarization
|
|
564
|
+
- [Pygments](https://pygments.org/) for syntax highlighting and language detection
|
|
565
|
+
|
|
566
|
+
## Changelog
|
|
567
|
+
|
|
568
|
+
### Version 1.0.0
|
|
569
|
+
|
|
570
|
+
- Initial release
|
|
571
|
+
- Document processing with OCR support
|
|
572
|
+
- Codebase analysis and export
|
|
573
|
+
- AI-powered summarization
|
|
574
|
+
- Comprehensive configuration options
|
|
575
|
+
- Multi-language support
|