rag-knowledge-preparation 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. rag_knowledge_preparation-1.0.0/CHANGELOG.md +7 -0
  2. rag_knowledge_preparation-1.0.0/LICENSE +21 -0
  3. rag_knowledge_preparation-1.0.0/MANIFEST.in +4 -0
  4. rag_knowledge_preparation-1.0.0/PKG-INFO +575 -0
  5. rag_knowledge_preparation-1.0.0/README.md +519 -0
  6. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/__init__.py +65 -0
  7. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/__init__.py +73 -0
  8. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/CodeAnalyzer.py +211 -0
  9. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/CodeSummarizer.py +162 -0
  10. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/DependencyAnalyzer.py +119 -0
  11. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/analysis/__init__.py +27 -0
  12. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/CodebaseConfig.py +171 -0
  13. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/CodebaseConverter.py +83 -0
  14. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/core/__init__.py +27 -0
  15. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/export/CodebaseExporter.py +345 -0
  16. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/export/__init__.py +11 -0
  17. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/CodebaseConstants.py +34 -0
  18. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/FileUtils.py +102 -0
  19. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/codebase_processing/utils/__init__.py +35 -0
  20. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/DocumentConverter.py +260 -0
  21. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/DocumentProcessingConfig.py +126 -0
  22. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/document_processing/__init__.py +31 -0
  23. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/utils/CustomExceptions.py +18 -0
  24. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation/utils/__init__.py +0 -0
  25. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/PKG-INFO +575 -0
  26. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/SOURCES.txt +40 -0
  27. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/dependency_links.txt +1 -0
  28. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/requires.txt +24 -0
  29. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/top_level.txt +1 -0
  30. rag_knowledge_preparation-1.0.0/rag_knowledge_preparation.egg-info/zip-safe +1 -0
  31. rag_knowledge_preparation-1.0.0/setup.cfg +12 -0
  32. rag_knowledge_preparation-1.0.0/setup.py +71 -0
  33. rag_knowledge_preparation-1.0.0/tests/test_CodebaseAnalyzer.py +327 -0
  34. rag_knowledge_preparation-1.0.0/tests/test_CodebasePerformanceOptimizer.py +213 -0
  35. rag_knowledge_preparation-1.0.0/tests/test_CustomExceptions.py +78 -0
  36. rag_knowledge_preparation-1.0.0/tests/test_DocumentBatchProcessor.py +189 -0
  37. rag_knowledge_preparation-1.0.0/tests/test_DocumentConfiguration.py +192 -0
  38. rag_knowledge_preparation-1.0.0/tests/test_DocumentConverter.py +97 -0
  39. rag_knowledge_preparation-1.0.0/tests/test_DocumentErrorHandler.py +94 -0
  40. rag_knowledge_preparation-1.0.0/tests/test_DocumentFormatSupport.py +63 -0
  41. rag_knowledge_preparation-1.0.0/tests/test_DocumentProcessingModes.py +143 -0
@@ -0,0 +1,7 @@
1
+ # ![Pip.Services Logo](https://uploads-ssl.webflow.com/5ea5d3315186cf5ec60c3ee4/5edf1c94ce4c859f2b188094_logo.svg)
2
+
3
+ RAG Knowledge Preparation in Python Changelog
4
+
5
+ ## 1.0.0 (2025-02-13)
6
+
7
+ Initial public release
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 lib-ragknowledgepreparation-python
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include README.md
2
+ include CHANGELOG.md
3
+ include LICENSE
4
+ recursive-include rag_knowledge_preparation *
@@ -0,0 +1,575 @@
1
+ Metadata-Version: 2.4
2
+ Name: rag_knowledge_preparation
3
+ Version: 1.0.0
4
+ Summary: RAG Knowledge Preparation in Python
5
+ Home-page: https://bitbucket.org/entinco/eic-aimodelknowledge-utils/src/master/lib-ragknowledgepreparation-python
6
+ Author: Enterprise Innovation Consulting LLC
7
+ Author-email: seroukhov@entinco.com
8
+ License: Commercial
9
+ Platform: any
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: Other/Proprietary License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: pytest
22
+ Requires-Dist: requests<3.0.0,>=2.27.1
23
+ Requires-Dist: urllib3<2.0.0,>=1.26.8
24
+ Requires-Dist: docling>=1.3.0
25
+ Requires-Dist: tiktoken>=0.5.0
26
+ Requires-Dist: pathspec>=0.11.0
27
+ Requires-Dist: tree-sitter>=0.20.0
28
+ Requires-Dist: tree-sitter-python>=0.20.0
29
+ Requires-Dist: tree-sitter-javascript>=0.20.0
30
+ Requires-Dist: tree-sitter-typescript>=0.20.0
31
+ Requires-Dist: pygments>=2.15.0
32
+ Requires-Dist: pydantic>=2.0.0
33
+ Requires-Dist: chardet>=5.0.0
34
+ Requires-Dist: google-generativeai>=0.3.0
35
+ Requires-Dist: pip-services4-commons>=0.0.0
36
+ Requires-Dist: pip-services4-components>=0.0.0
37
+ Requires-Dist: pip-services4-config>=0.0.0
38
+ Requires-Dist: pip-services4-data>=0.0.0
39
+ Requires-Dist: pip-services4-http>=0.0.0
40
+ Requires-Dist: pip-services4-mongodb>=0.0.0
41
+ Requires-Dist: pip-services4-persistence>=0.0.0
42
+ Requires-Dist: pip-services4-prometheus>=0.0.0
43
+ Requires-Dist: pip-services4-rpc>=0.0.0
44
+ Requires-Dist: pip-services4-swagger>=0.0.0
45
+ Dynamic: author
46
+ Dynamic: author-email
47
+ Dynamic: classifier
48
+ Dynamic: description
49
+ Dynamic: description-content-type
50
+ Dynamic: home-page
51
+ Dynamic: license
52
+ Dynamic: license-file
53
+ Dynamic: platform
54
+ Dynamic: requires-dist
55
+ Dynamic: summary
56
+
57
+ # RAG Knowledge Preparation Python
58
+
59
+ A comprehensive Python library for preparing knowledge bases for Retrieval-Augmented Generation (RAG) systems. This library provides powerful tools for document processing with OCR capabilities, advanced table processing, and intelligent codebase analysis.
60
+
61
+ ## Features
62
+
63
+ ### Document Processing Features
64
+
65
+ - **Multi-format Support**: Convert PDF, DOCX, HTML, CSV, and other formats to Markdown
66
+ - **OCR Integration**: Extract text from scanned documents using EasyOCR or Tesseract
67
+ - **Advanced Table Processing**: Intelligent table detection and conversion using TableFormer
68
+ - **Batch Processing**: Process multiple documents or entire folders efficiently
69
+ - **Configurable Quality**: Multiple processing presets for different use cases
70
+
71
+ ### Codebase Analysis Features
72
+
73
+ - **Comprehensive Analysis**: Extract structure, dependencies, and metadata from codebases
74
+ - **Multi-language Support**: Python, JavaScript, TypeScript and more
75
+ - **AI-Powered Summaries**: Generate intelligent code summaries using Google Gemini
76
+ - **Dependency Analysis**: Identify and categorize internal, external, and standard library dependencies
77
+ - **Structure Extraction**: Parse classes, functions, imports, and code organization
78
+ - **Token Estimation**: Accurate token counting for RAG optimization
79
+
80
+ ### Configuration & Customization
81
+
82
+ - **Flexible Configuration**: Extensive configuration options for both document and codebase processing
83
+ - **Preset Configurations**: Pre-built configurations for common use cases
84
+ - **Custom Metadata**: Configurable metadata fields for different analysis needs
85
+ - **Performance Optimization**: Built-in performance modes for large-scale processing
86
+
87
+ ## Installation
88
+
89
+ ```bash
90
+ pip install rag-knowledge-preparation
91
+ ```
92
+
93
+ ### Development Installation
94
+
95
+ ```bash
96
+ git clone https://bitbucket.org/entinco/eic-aimodelknowledge-utils.git
97
+ cd eic-aimodelknowledge-utils/lib-ragknowledgepreparation-python
98
+ pip install -e .
99
+ ```
100
+
101
+ ## Quick Start
102
+
103
+ ### Document Processing
104
+
105
+ ```python
106
+ from rag_knowledge_preparation import (
107
+ convert_document_to_markdown,
108
+ convert_scanned_document_to_markdown,
109
+ convert_documents_batch
110
+ )
111
+
112
+ # Convert a single document
113
+ markdown_content = convert_document_to_markdown("document.pdf")
114
+
115
+ # Convert a scanned document with OCR
116
+ scanned_content = convert_scanned_document_to_markdown("scanned_document.pdf")
117
+
118
+ # Process multiple documents
119
+ results = convert_documents_batch(["doc1.pdf", "doc2.docx", "doc3.html"])
120
+ ```
121
+
122
+ ### Codebase Analysis
123
+
124
+ ```python
125
+ from rag_knowledge_preparation import (
126
+ export_codebase_to_markdown,
127
+ analyze_codebase_structure,
128
+ get_codebase_overview
129
+ )
130
+
131
+ # Export entire codebase to Markdown
132
+ output_file = export_codebase_to_markdown("./my_project", "codebase_export.md")
133
+
134
+ # Analyze codebase structure
135
+ structure = analyze_codebase_structure("./my_project")
136
+
137
+ # Get high-level overview
138
+ overview = get_codebase_overview("./my_project")
139
+ ```
140
+
141
+ ## Document Processing Details
142
+
143
+ ### Supported Formats
144
+
145
+ - **PDF**: Native PDF processing with OCR support
146
+ - **Microsoft Office**: DOCX, DOC, PPTX, PPT
147
+ - **Web Formats**: HTML, XML
148
+ - **Data Formats**: CSV, TSV, JSON
149
+ - **Text Formats**: TXT, MD, RST
150
+
151
+ ### Processing Presets
152
+
153
+ #### Basic Processing
154
+
155
+ ```python
156
+ from rag_knowledge_preparation import convert_document_to_markdown
157
+
158
+ # Basic processing without OCR
159
+ content = convert_document_to_markdown(
160
+ "document.pdf",
161
+ processing_preset="basic"
162
+ )
163
+ ```
164
+
165
+ #### Standard Document Processing
166
+
167
+ ```python
168
+ # Standard processing with OCR and advanced tables
169
+ content = convert_document_to_markdown(
170
+ "document.pdf",
171
+ processing_preset="standard"
172
+ )
173
+ ```
174
+
175
+ #### OCR-Heavy Processing
176
+
177
+ ```python
178
+ # Heavy OCR processing for scanned documents
179
+ content = convert_document_to_markdown(
180
+ "scanned_document.pdf",
181
+ processing_preset="ocr_heavy"
182
+ )
183
+ ```
184
+
185
+ #### Table-Focused Processing
186
+
187
+ ```python
188
+ # Optimized for documents with complex tables
189
+ content = convert_document_to_markdown(
190
+ "data_heavy_document.pdf",
191
+ processing_preset="table_focused"
192
+ )
193
+ ```
194
+
195
+ #### High-Quality Processing
196
+
197
+ ```python
198
+ # Maximum quality with all features enabled
199
+ content = convert_document_to_markdown(
200
+ "important_document.pdf",
201
+ processing_preset="high_quality"
202
+ )
203
+ ```
204
+
205
+ ### Custom Configuration
206
+
207
+ ```python
208
+ from rag_knowledge_preparation import convert_document_to_markdown
209
+
210
+ # Custom configuration
211
+ content = convert_document_to_markdown(
212
+ "document.pdf",
213
+ processing_preset="standard",
214
+ ocr_engine="tesseract",
215
+ ocr_language="en",
216
+ table_confidence_threshold=0.9,
217
+ enable_cell_matching=True
218
+ )
219
+ ```
220
+
221
+ ### Batch Processing
222
+
223
+ ```python
224
+ from rag_knowledge_preparation import convert_documents_batch, convert_folder_to_markdown
225
+
226
+ # Process multiple files
227
+ results = convert_documents_batch([
228
+ "document1.pdf",
229
+ "document2.docx",
230
+ "document3.html"
231
+ ])
232
+
233
+ # Process entire folder
234
+ folder_results = convert_folder_to_markdown("./documents/")
235
+ ```
236
+
237
+ ## Codebase Analysis Usage
238
+
239
+ ### Basic Analysis
240
+
241
+ ```python
242
+ from rag_knowledge_preparation import analyze_codebase_structure
243
+
244
+ # Analyze codebase structure
245
+ structure = analyze_codebase_structure("./my_project")
246
+
247
+ print(f"Total files: {structure['total_files']}")
248
+ print(f"Total lines: {structure['total_lines']}")
249
+ print(f"Languages: {structure['languages']}")
250
+ ```
251
+
252
+ ### Export to Markdown
253
+
254
+ ```python
255
+ from rag_knowledge_preparation import export_codebase_to_markdown
256
+
257
+ # Export with default settings
258
+ output_file = export_codebase_to_markdown("./my_project")
259
+
260
+ # Export with custom output file
261
+ output_file = export_codebase_to_markdown(
262
+ "./my_project",
263
+ output_file="my_codebase.md"
264
+ )
265
+ ```
266
+
267
+ ### AI-Powered Analysis
268
+
269
+ ```python
270
+ from rag_knowledge_preparation import export_codebase_to_markdown
271
+
272
+ # Export with AI summaries (requires Gemini API key)
273
+ output_file = export_codebase_to_markdown(
274
+ "./my_project",
275
+ gemini_api_key="your-gemini-api-key",
276
+ gemini_model="gemini-pro"
277
+ )
278
+ ```
279
+
280
+ ### Codebase Processing Presets
281
+
282
+ #### Minimal Processing
283
+
284
+ ```python
285
+ from rag_knowledge_preparation import export_codebase_to_markdown
286
+
287
+ # Minimal processing - basic analysis only
288
+ output_file = export_codebase_to_markdown(
289
+ "./my_project",
290
+ processing_preset="minimal"
291
+ )
292
+ ```
293
+
294
+ #### Standard Processing
295
+
296
+ ```python
297
+ # Standard processing with full analysis
298
+ output_file = export_codebase_to_markdown(
299
+ "./my_project",
300
+ processing_preset="standard"
301
+ )
302
+ ```
303
+
304
+ #### Comprehensive Processing
305
+
306
+ ```python
307
+ # Comprehensive processing with all features
308
+ output_file = export_codebase_to_markdown(
309
+ "./my_project",
310
+ processing_preset="comprehensive"
311
+ )
312
+ ```
313
+
314
+ ### Configuration Options
315
+
316
+ ```python
317
+ from rag_knowledge_preparation import (
318
+ CodebaseProcessingConfig,
319
+ MetadataConfig,
320
+ export_codebase_to_markdown
321
+ )
322
+
323
+ # Custom configuration
324
+ config = CodebaseProcessingConfig(
325
+ max_file_size_mb=2.0,
326
+ include_test_files=False,
327
+ include_documentation=True,
328
+ enable_ai_summary=True,
329
+ gemini_api_key="your-api-key",
330
+ custom_ignore_patterns=["*.log", "temp/*"]
331
+ )
332
+
333
+ # Custom metadata configuration
334
+ metadata_config = MetadataConfig(
335
+ include_file_path=True,
336
+ include_language=True,
337
+ include_purpose=True,
338
+ include_dependencies=True,
339
+ include_structure=True,
340
+ include_summary=True
341
+ )
342
+
343
+ config.metadata_config = metadata_config
344
+
345
+ # Use custom configuration
346
+ output_file = export_codebase_to_markdown(
347
+ "./my_project",
348
+ processing_preset="custom",
349
+ **config.model_dump()
350
+ )
351
+ ```
352
+
353
+ ## Advanced Features
354
+
355
+ ### Language Detection and Classification
356
+
357
+ The library automatically detects programming languages and classifies files by purpose:
358
+
359
+ ```python
360
+ from rag_knowledge_preparation.codebase_processing.analysis import (
361
+ get_language_from_extension,
362
+ classify_file_by_purpose
363
+ )
364
+
365
+ # Detect language from file extension
366
+ language = get_language_from_extension("script.py") # Returns "python"
367
+
368
+ # Classify file by purpose
369
+ purpose = classify_file_by_purpose("test_utils.py") # Returns "Tests"
370
+ ```
371
+
372
+ ### Dependency Analysis
373
+
374
+ ```python
375
+ from pathlib import Path
376
+ from rag_knowledge_preparation.codebase_processing.analysis import analyze_file_dependencies
377
+
378
+ # Analyze dependencies in a Python file
379
+ with open("main.py", "r") as f:
380
+ content = f.read()
381
+ dependencies = analyze_file_dependencies(content, Path("main.py"), "python")
382
+
383
+ print("External packages:", dependencies["external_packages"])
384
+ print("Standard library:", dependencies["standard_library"])
385
+ print("Internal modules:", dependencies["internal_modules"])
386
+ ```
387
+
388
+ ### Code Structure Extraction
389
+
390
+ ```python
391
+ from pathlib import Path
392
+ from rag_knowledge_preparation.codebase_processing.analysis import extract_code_structure
393
+
394
+ # Extract structure from code file
395
+ code_content = """
396
+ class MyClass:
397
+ def __init__(self):
398
+ pass
399
+
400
+ def method(self):
401
+ pass
402
+ """
403
+ structure = extract_code_structure(Path("example.py"), "python", code_content)
404
+
405
+ print("Classes:", structure["classes"])
406
+ print("Functions:", structure["functions"])
407
+ ```
408
+
409
+ ### Token Estimation
410
+
411
+ ```python
412
+ from rag_knowledge_preparation.codebase_processing.analysis import estimate_token_count
413
+
414
+ # Estimate tokens in text
415
+ token_count = estimate_token_count("Hello, world!")
416
+ print(f"Estimated tokens: {token_count}")
417
+
418
+ # Estimate tokens in code
419
+ code_tokens = estimate_token_count("""
420
+ def hello():
421
+ print("Hello, world!")
422
+ """)
423
+ ```
424
+
425
+ ## Configuration Reference
426
+
427
+ ### Document Processing Configuration
428
+
429
+ | Parameter | Type | Default | Description |
430
+ |-----------|------|---------|-------------|
431
+ | `enable_ocr` | bool | True | Enable OCR processing |
432
+ | `table_processing` | str | "advanced" | Table processing mode (basic, advanced, tableformer) |
433
+ | `ocr_engine` | str | "easyocr" | OCR engine (easyocr, tesseract) |
434
+ | `ocr_language` | str | "en" | OCR language (en, fr, de, es) |
435
+ | `table_confidence_threshold` | float | 0.8 | Table detection confidence threshold |
436
+ | `enable_cell_matching` | bool | True | Enable cell matching in tables |
437
+ | `enable_table_structure` | bool | True | Enable table structure analysis |
438
+
439
+ ### Codebase Processing Configuration
440
+
441
+ | Parameter | Type | Default | Description |
442
+ |-----------|------|---------|-------------|
443
+ | `max_file_size_mb` | float | 1.0 | Maximum file size to process |
444
+ | `include_hidden_files` | bool | False | Include hidden files |
445
+ | `include_test_files` | bool | True | Include test files |
446
+ | `include_documentation` | bool | True | Include documentation files |
447
+ | `include_config_files` | bool | True | Include configuration files |
448
+ | `enable_structure_analysis` | bool | True | Enable code structure analysis |
449
+ | `enable_ai_summary` | bool | True | Enable AI-powered summaries |
450
+ | `gemini_api_key` | str | None | Google Gemini API key |
451
+ | `gemini_model` | str | "gemini-pro" | Gemini model to use |
452
+ | `custom_ignore_patterns` | List[str] | None | Custom ignore patterns |
453
+
454
+ ## Error Handling
455
+
456
+ The library provides comprehensive error handling with custom exceptions:
457
+
458
+ ```python
459
+ from rag_knowledge_preparation import (
460
+ convert_document_to_markdown, RAGKnowledgePreparationError,
461
+ DocumentNotFoundError,
462
+ ConfigurationError,
463
+ ConversionError,
464
+ UnsupportedFormatError
465
+ )
466
+
467
+ try:
468
+ content = convert_document_to_markdown("nonexistent.pdf")
469
+ except DocumentNotFoundError as e:
470
+ print(f"Document not found: {e}")
471
+ except ConversionError as e:
472
+ print(f"Conversion failed: {e}")
473
+ except ConfigurationError as e:
474
+ print(f"Configuration error: {e}")
475
+ ```
476
+
477
+ ## Performance Considerations
478
+
479
+ ### Large File Processing
480
+
481
+ The library includes built-in optimizations for large files:
482
+
483
+ - **File Size Limits**: Configurable maximum file size limits
484
+ - **Memory Efficiency**: Streaming processing for large documents
485
+ - **Batch Processing**: Efficient processing of multiple files
486
+ - **Parallel Processing**: Concurrent processing where possible
487
+
488
+ ### Performance Modes
489
+
490
+ ```python
491
+ # Use performance-optimized settings
492
+ config = CodebaseProcessingConfig(
493
+ max_file_size_mb=0.5, # Smaller file limit
494
+ enable_ai_summary=False, # Disable AI for speed
495
+ enable_structure_analysis=False # Disable structure analysis
496
+ )
497
+ ```
498
+
499
+ ## Examples
500
+
501
+ ### Complete Document Processing Pipeline
502
+
503
+ ```python
504
+ from rag_knowledge_preparation import (
505
+ convert_folder_to_markdown,
506
+ list_document_configs
507
+ )
508
+
509
+ # List available configurations
510
+ configs = list_document_configs()
511
+ print("Available configurations:", list(configs.keys()))
512
+
513
+ # Process entire document folder
514
+ results = convert_folder_to_markdown(
515
+ "./documents/",
516
+ processing_preset="high_quality"
517
+ )
518
+
519
+ # Save results
520
+ for file_path, content in results.items():
521
+ output_path = f"processed_{file_path.split('/')[-1]}.md"
522
+ with open(output_path, 'w', encoding='utf-8') as f:
523
+ f.write(content)
524
+ ```
525
+
526
+ ### Complete Codebase Analysis Pipeline
527
+
528
+ ```python
529
+ from rag_knowledge_preparation import (
530
+ export_codebase_to_markdown,
531
+ analyze_codebase_structure,
532
+ get_codebase_overview,
533
+ list_available_codebase_configs
534
+ )
535
+
536
+ # List available configurations
537
+ configs = list_available_codebase_configs()
538
+ print("Available configurations:", list(configs.keys()))
539
+
540
+ # Get overview
541
+ overview = get_codebase_overview("./my_project")
542
+ print(f"Project: {overview['name']}")
543
+ print(f"Files: {overview['total_files']}")
544
+ print(f"Languages: {overview['languages']}")
545
+
546
+ # Analyze structure
547
+ structure = analyze_codebase_structure("./my_project")
548
+ print(f"Structure analysis complete: {structure['total_files']} files processed")
549
+
550
+ # Export to Markdown
551
+ output_file = export_codebase_to_markdown(
552
+ "./my_project",
553
+ output_file="project_analysis.md",
554
+ gemini_api_key="your-api-key"
555
+ )
556
+ print(f"Exported to: {output_file}")
557
+ ```
558
+
559
+ ## Acknowledgments
560
+
561
+ - [Docling](https://github.com/DS4SD/docling) for document processing capabilities
562
+ - [Tree-sitter](https://tree-sitter.github.io/) for code parsing
563
+ - [Google Gemini](https://ai.google.dev/) for AI-powered summarization
564
+ - [Pygments](https://pygments.org/) for syntax highlighting and language detection
565
+
566
+ ## Changelog
567
+
568
+ ### Version 1.0.0
569
+
570
+ - Initial release
571
+ - Document processing with OCR support
572
+ - Codebase analysis and export
573
+ - AI-powered summarization
574
+ - Comprehensive configuration options
575
+ - Multi-language support