smartdocloader 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smartdocloader-1.0.0/LICENSE.txt +19 -0
- smartdocloader-1.0.0/PKG-INFO +390 -0
- smartdocloader-1.0.0/README.md +344 -0
- smartdocloader-1.0.0/setup.cfg +4 -0
- smartdocloader-1.0.0/setup.py +46 -0
- smartdocloader-1.0.0/smartdocloader/__init__.py +48 -0
- smartdocloader-1.0.0/smartdocloader/doc_loader.py +192 -0
- smartdocloader-1.0.0/smartdocloader/smart_loader.py +116 -0
- smartdocloader-1.0.0/smartdocloader/text_loader.py +160 -0
- smartdocloader-1.0.0/smartdocloader/utils.py +166 -0
- smartdocloader-1.0.0/smartdocloader.egg-info/PKG-INFO +390 -0
- smartdocloader-1.0.0/smartdocloader.egg-info/SOURCES.txt +13 -0
- smartdocloader-1.0.0/smartdocloader.egg-info/dependency_links.txt +1 -0
- smartdocloader-1.0.0/smartdocloader.egg-info/requires.txt +10 -0
- smartdocloader-1.0.0/smartdocloader.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2026
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smartdocloader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An advanced universal document loader - load TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX with smart auto-detection
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Author-email: your.email@example.com
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Source, https://github.com/yourusername/smartdocloader
|
|
9
|
+
Keywords: document,loader,pdf,docx,xlsx,csv,json,yaml,parser,reader
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Requires-Python: >=3.7
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE.txt
|
|
25
|
+
Requires-Dist: PyPDF2
|
|
26
|
+
Requires-Dist: python-docx
|
|
27
|
+
Requires-Dist: python-pptx
|
|
28
|
+
Requires-Dist: openpyxl
|
|
29
|
+
Provides-Extra: yaml
|
|
30
|
+
Requires-Dist: pyyaml; extra == "yaml"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pyyaml; extra == "all"
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: keywords
|
|
39
|
+
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
Dynamic: project-url
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
46
|
+
|
|
47
|
+
# smartdocloader
|
|
48
|
+
|
|
49
|
+
An advanced universal document loader for Python, featuring smart auto-detection, batch loading, content search, and support for 11 file formats.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install smartdocloader
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For YAML support:
|
|
58
|
+
```bash
|
|
59
|
+
pip install smartdocloader[yaml]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Features
|
|
63
|
+
|
|
64
|
+
- **Auto-detection**: Automatically detects file type and uses the right loader
|
|
65
|
+
- **Batch loading**: Load multiple files in one call
|
|
66
|
+
- **11 formats supported**: TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX
|
|
67
|
+
- **Content search**: Search within loaded data
|
|
68
|
+
- **Export**: Convert loaded data to JSON
|
|
69
|
+
- **File comparison**: Compare content of two files
|
|
70
|
+
- **Advanced options**: Page ranges for PDFs, sheet selection for Excel, table extraction from Word
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from smartdocloader import auto_load
|
|
78
|
+
|
|
79
|
+
# Just pass any file - it auto-detects the format
|
|
80
|
+
data = auto_load("report.pdf")
|
|
81
|
+
data = auto_load("data.csv")
|
|
82
|
+
data = auto_load("config.yaml")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## All Supported Formats
|
|
88
|
+
|
|
89
|
+
| Format | Extensions | Module |
|
|
90
|
+
|--------|-----------|--------|
|
|
91
|
+
| Plain Text | `.txt` | text_loader |
|
|
92
|
+
| CSV | `.csv` | text_loader |
|
|
93
|
+
| JSON | `.json` | text_loader |
|
|
94
|
+
| XML | `.xml` | text_loader |
|
|
95
|
+
| YAML | `.yaml`, `.yml` | text_loader |
|
|
96
|
+
| INI/Config | `.ini`, `.cfg` | text_loader |
|
|
97
|
+
| HTML | `.html`, `.htm` | text_loader |
|
|
98
|
+
| PDF | `.pdf` | doc_loader |
|
|
99
|
+
| Word | `.docx` | doc_loader |
|
|
100
|
+
| PowerPoint | `.pptx` | doc_loader |
|
|
101
|
+
| Excel | `.xlsx` | doc_loader |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Modules & Functions
|
|
106
|
+
|
|
107
|
+
### Module 1: `text_loader`
|
|
108
|
+
|
|
109
|
+
| Function | Description |
|
|
110
|
+
|----------|-------------|
|
|
111
|
+
| `load_txt(filepath, encoding)` | Load plain text file |
|
|
112
|
+
| `load_csv(filepath, delimiter, encoding)` | Load CSV as list of dicts |
|
|
113
|
+
| `load_json(filepath, encoding)` | Load and parse JSON |
|
|
114
|
+
| `load_xml(filepath)` | Load XML as nested dict |
|
|
115
|
+
| `load_yaml(filepath, encoding)` | Load YAML data |
|
|
116
|
+
| `load_ini(filepath, encoding)` | Load INI as nested dict |
|
|
117
|
+
| `load_html(filepath, encoding)` | Extract text from HTML |
|
|
118
|
+
|
|
119
|
+
### Module 2: `doc_loader`
|
|
120
|
+
|
|
121
|
+
| Function | Description |
|
|
122
|
+
|----------|-------------|
|
|
123
|
+
| `load_pdf(filepath, page_range)` | Extract text from PDF (optional page range) |
|
|
124
|
+
| `load_pdf_pages(filepath)` | Get text per page as a list |
|
|
125
|
+
| `load_pdf_metadata(filepath)` | Get PDF metadata (author, title, etc.) |
|
|
126
|
+
| `load_docx(filepath, include_tables)` | Load Word document paragraphs (optionally including tables) |
|
|
127
|
+
| `load_docx_with_styles(filepath)` | Load Word document paragraphs with style/formatting info |
|
|
128
|
+
| `load_pptx(filepath, include_notes)` | Load PowerPoint slides (optionally including notes) |
|
|
129
|
+
| `load_xlsx(filepath, sheet_name)` | Load Excel data from a specific sheet |
|
|
130
|
+
| `load_xlsx_sheets(filepath)` | List all sheet names |
|
|
131
|
+
|
|
132
|
+
### Module 3: `smart_loader`
|
|
133
|
+
|
|
134
|
+
| Function | Description |
|
|
135
|
+
|----------|-------------|
|
|
136
|
+
| `auto_load(filepath)` | Auto-detect format and load |
|
|
137
|
+
| `batch_load(filepaths)` | Load multiple files at once |
|
|
138
|
+
| `get_file_info(filepath)` | Get file metadata (size, type, etc.) |
|
|
139
|
+
| `supported_formats()` | List all supported extensions |
|
|
140
|
+
|
|
141
|
+
### Module 4: `utils`
|
|
142
|
+
|
|
143
|
+
| Function | Description |
|
|
144
|
+
|----------|-------------|
|
|
145
|
+
| `search_content(data, keyword)` | Search within loaded content |
|
|
146
|
+
| `convert_to_text(data)` | Convert any loaded data to plain text |
|
|
147
|
+
| `word_count(data)` | Count words in loaded content |
|
|
148
|
+
| `export_to_json(data, output_path)` | Export loaded data to JSON file |
|
|
149
|
+
| `compare_files(filepath1, filepath2)` | Compare two files |
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Usage Examples
|
|
154
|
+
|
|
155
|
+
### Auto-Loading (Smart Detection)
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from smartdocloader import auto_load
|
|
159
|
+
|
|
160
|
+
# Just pass any file path - format is auto-detected
|
|
161
|
+
pdf_content = auto_load("report.pdf")
|
|
162
|
+
csv_data = auto_load("students.csv")
|
|
163
|
+
config = auto_load("settings.yaml")
|
|
164
|
+
|
|
165
|
+
print(pdf_content[:100])
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Batch Loading Multiple Files
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from smartdocloader import batch_load
|
|
172
|
+
|
|
173
|
+
files = ["data.csv", "report.pdf", "config.json", "notes.txt"]
|
|
174
|
+
results = batch_load(files)
|
|
175
|
+
|
|
176
|
+
for filepath, content in results.items():
|
|
177
|
+
if isinstance(content, dict) and "error" in content:
|
|
178
|
+
print(f"Failed: {filepath} - {content['error']}")
|
|
179
|
+
else:
|
|
180
|
+
print(f"Loaded: {filepath}")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Loading PDFs with Options
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from smartdocloader import load_pdf, load_pdf_pages, load_pdf_metadata
|
|
187
|
+
|
|
188
|
+
# Load entire PDF
|
|
189
|
+
full_text = load_pdf("book.pdf")
|
|
190
|
+
|
|
191
|
+
# Load only pages 0-4 (first 5 pages)
|
|
192
|
+
intro = load_pdf("book.pdf", page_range=(0, 5))
|
|
193
|
+
|
|
194
|
+
# Get text per page
|
|
195
|
+
pages = load_pdf_pages("book.pdf")
|
|
196
|
+
print(f"Page 1: {pages[0][:100]}")
|
|
197
|
+
print(f"Total pages: {len(pages)}")
|
|
198
|
+
|
|
199
|
+
# Get metadata
|
|
200
|
+
meta = load_pdf_metadata("book.pdf")
|
|
201
|
+
print(f"Author: {meta['author']}")
|
|
202
|
+
print(f"Title: {meta['title']}")
|
|
203
|
+
print(f"Pages: {meta['page_count']}")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Loading Word Documents
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from smartdocloader import load_docx, load_docx_with_styles
|
|
210
|
+
|
|
211
|
+
# Basic loading
|
|
212
|
+
paragraphs = load_docx("report.docx")
|
|
213
|
+
for p in paragraphs:
|
|
214
|
+
print(p)
|
|
215
|
+
|
|
216
|
+
# With tables included
|
|
217
|
+
content = load_docx("report.docx", include_tables=True)
|
|
218
|
+
for item in content:
|
|
219
|
+
print(item)
|
|
220
|
+
|
|
221
|
+
# With style information
|
|
222
|
+
styled = load_docx_with_styles("report.docx")
|
|
223
|
+
for para in styled:
|
|
224
|
+
if para["bold"]:
|
|
225
|
+
print(f"[BOLD] {para['text']}")
|
|
226
|
+
else:
|
|
227
|
+
print(f" {para['text']}")
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Loading Excel with Sheet Selection
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from smartdocloader import load_xlsx, load_xlsx_sheets
|
|
234
|
+
|
|
235
|
+
# See available sheets
|
|
236
|
+
sheets = load_xlsx_sheets("financials.xlsx")
|
|
237
|
+
print(f"Sheets: {sheets}")
|
|
238
|
+
|
|
239
|
+
# Load specific sheet
|
|
240
|
+
q1_data = load_xlsx("financials.xlsx", sheet_name="Q1")
|
|
241
|
+
for row in q1_data:
|
|
242
|
+
print(row)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Loading PowerPoint with Notes
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from smartdocloader import load_pptx
|
|
249
|
+
|
|
250
|
+
slides = load_pptx("lecture.pptx", include_notes=True)
|
|
251
|
+
for slide in slides:
|
|
252
|
+
print(f"--- Slide {slide['slide_number']} ---")
|
|
253
|
+
for text in slide["text"]:
|
|
254
|
+
print(f" {text}")
|
|
255
|
+
if "notes" in slide and slide["notes"]:
|
|
256
|
+
print(f" [Notes: {slide['notes']}]")
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Loading YAML Configuration
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
from smartdocloader import load_yaml
|
|
263
|
+
|
|
264
|
+
config = load_yaml("docker-compose.yml")
|
|
265
|
+
print(config["services"])
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Loading INI/Config Files
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
from smartdocloader import load_ini
|
|
272
|
+
|
|
273
|
+
settings = load_ini("app.ini")
|
|
274
|
+
print(settings["database"]["host"])
|
|
275
|
+
print(settings["database"]["port"])
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Loading HTML (Text Extraction)
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
from smartdocloader import load_html
|
|
282
|
+
|
|
283
|
+
text = load_html("page.html")
|
|
284
|
+
print(text) # Clean text without HTML tags
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Searching Within Loaded Content
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from smartdocloader import auto_load, search_content
|
|
291
|
+
|
|
292
|
+
# Load any file
|
|
293
|
+
data = auto_load("students.csv")
|
|
294
|
+
|
|
295
|
+
# Search for a keyword
|
|
296
|
+
matches = search_content(data, "Ahmed")
|
|
297
|
+
print(f"Found {len(matches)} matches:")
|
|
298
|
+
for match in matches:
|
|
299
|
+
print(f" {match}")
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Converting to Plain Text
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from smartdocloader import auto_load, convert_to_text
|
|
306
|
+
|
|
307
|
+
# Load structured data
|
|
308
|
+
data = auto_load("grades.xlsx")
|
|
309
|
+
|
|
310
|
+
# Convert to flat text
|
|
311
|
+
text = convert_to_text(data)
|
|
312
|
+
print(text)
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### Exporting to JSON
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from smartdocloader import auto_load, export_to_json
|
|
319
|
+
|
|
320
|
+
# Load a Word document
|
|
321
|
+
data = auto_load("report.docx")
|
|
322
|
+
|
|
323
|
+
# Export as JSON for further processing
|
|
324
|
+
export_to_json(data, "report_output.json")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### Comparing Two Files
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
from smartdocloader import compare_files
|
|
331
|
+
|
|
332
|
+
result = compare_files("version1.txt", "version2.txt")
|
|
333
|
+
print(f"Identical: {result['identical']}")
|
|
334
|
+
print(f"File 1: {result['file1_lines']} lines, {result['file1_words']} words")
|
|
335
|
+
print(f"File 2: {result['file2_lines']} lines, {result['file2_words']} words")
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Getting File Info
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from smartdocloader import get_file_info
|
|
342
|
+
|
|
343
|
+
info = get_file_info("report.pdf")
|
|
344
|
+
print(f"Name: {info['name']}")
|
|
345
|
+
print(f"Size: {info['size_readable']}")
|
|
346
|
+
print(f"Supported: {info['is_supported']}")
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Listing Supported Formats
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from smartdocloader import supported_formats
|
|
353
|
+
|
|
354
|
+
formats = supported_formats()
|
|
355
|
+
print(f"Supported: {', '.join(formats)}")
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
---
|
|
359
|
+
|
|
360
|
+
## Error Handling
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
from smartdocloader import auto_load
|
|
364
|
+
|
|
365
|
+
try:
|
|
366
|
+
data = auto_load("unknown.xyz")
|
|
367
|
+
except ValueError as e:
|
|
368
|
+
print(f"Format error: {e}")
|
|
369
|
+
except FileNotFoundError as e:
|
|
370
|
+
print(f"File missing: {e}")
|
|
371
|
+
except Exception as e:
|
|
372
|
+
print(f"Error: {e}")
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
## Requirements
|
|
378
|
+
|
|
379
|
+
- Python >= 3.7
|
|
380
|
+
- PyPDF2
|
|
381
|
+
- python-docx
|
|
382
|
+
- python-pptx
|
|
383
|
+
- openpyxl
|
|
384
|
+
- pyyaml (optional, for YAML support)
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
## License
|
|
389
|
+
|
|
390
|
+
MIT
|