smartdocloader 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smartdocloader/__init__.py +48 -0
- smartdocloader/doc_loader.py +192 -0
- smartdocloader/smart_loader.py +116 -0
- smartdocloader/text_loader.py +160 -0
- smartdocloader/utils.py +166 -0
- smartdocloader-1.0.0.dist-info/METADATA +390 -0
- smartdocloader-1.0.0.dist-info/RECORD +10 -0
- smartdocloader-1.0.0.dist-info/WHEEL +5 -0
- smartdocloader-1.0.0.dist-info/licenses/LICENSE.txt +19 -0
- smartdocloader-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
smartdocloader - An advanced universal document loader library.
|
|
3
|
+
Load content from multiple file formats with a single smart API.
|
|
4
|
+
Supports: TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .text_loader import load_txt, load_csv, load_json, load_xml, load_yaml, load_ini, load_html
|
|
8
|
+
from .doc_loader import (
|
|
9
|
+
load_pdf, load_pdf_pages, load_pdf_metadata,
|
|
10
|
+
load_docx, load_docx_with_styles,
|
|
11
|
+
load_pptx,
|
|
12
|
+
load_xlsx, load_xlsx_sheets,
|
|
13
|
+
)
|
|
14
|
+
from .smart_loader import auto_load, batch_load, get_file_info, supported_formats
|
|
15
|
+
from .utils import search_content, convert_to_text, word_count, export_to_json, compare_files
|
|
16
|
+
|
|
17
|
+
__version__ = "1.0.0"
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
# text_loader
|
|
21
|
+
"load_txt",
|
|
22
|
+
"load_csv",
|
|
23
|
+
"load_json",
|
|
24
|
+
"load_xml",
|
|
25
|
+
"load_yaml",
|
|
26
|
+
"load_ini",
|
|
27
|
+
"load_html",
|
|
28
|
+
# doc_loader
|
|
29
|
+
"load_pdf",
|
|
30
|
+
"load_pdf_pages",
|
|
31
|
+
"load_pdf_metadata",
|
|
32
|
+
"load_docx",
|
|
33
|
+
"load_docx_with_styles",
|
|
34
|
+
"load_pptx",
|
|
35
|
+
"load_xlsx",
|
|
36
|
+
"load_xlsx_sheets",
|
|
37
|
+
# smart_loader
|
|
38
|
+
"auto_load",
|
|
39
|
+
"batch_load",
|
|
40
|
+
"get_file_info",
|
|
41
|
+
"supported_formats",
|
|
42
|
+
# utils
|
|
43
|
+
"search_content",
|
|
44
|
+
"convert_to_text",
|
|
45
|
+
"word_count",
|
|
46
|
+
"export_to_json",
|
|
47
|
+
"compare_files",
|
|
48
|
+
]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""
|
|
2
|
+
doc_loader module - Load common document formats.
|
|
3
|
+
Supports: .pdf, .docx, .pptx, .xlsx with advanced options
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import openpyxl
|
|
7
|
+
from docx import Document as DocxDocument
|
|
8
|
+
from pptx import Presentation
|
|
9
|
+
from PyPDF2 import PdfReader
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_pdf(filepath, page_range=None):
    """Extract the text of a PDF and return it as one string.

    Args:
        filepath: Path to the .pdf file.
        page_range: Optional ``(start, end)`` pair of 0-indexed page
            positions. It is applied as the slice ``pages[start:end]``, so
            ``end`` is exclusive. When falsy (the default ``None``), every
            page is read.

    Returns:
        str: Text of the selected pages joined with newlines. Pages for
        which no text could be extracted are left out of the result.
    """
    selected = PdfReader(filepath).pages
    if page_range:
        first, last = page_range
        selected = selected[first:last]

    # Pages yielding no text (None or "") are dropped, not joined as blanks.
    extracted = (page.extract_text() for page in selected)
    return "\n".join(chunk for chunk in extracted if chunk)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def load_pdf_pages(filepath):
    """Extract the text of a PDF page by page.

    Args:
        filepath: Path to the .pdf file.

    Returns:
        list[str]: One entry per page, in page order. A page with no
        extractable text contributes an empty string, so list indices
        always line up with page positions.
    """
    return [page.extract_text() or "" for page in PdfReader(filepath).pages]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def load_pdf_metadata(filepath):
    """Read the document-information fields and page count of a PDF.

    Args:
        filepath: Path to the .pdf file.

    Returns:
        dict: Keys ``title``, ``author``, ``subject``, ``creator`` and
        ``producer`` (each ``None`` when the file carries no metadata
        object) plus ``page_count``.
    """
    reader = PdfReader(filepath)
    meta = reader.metadata

    field_names = ("title", "author", "subject", "creator", "producer")
    # A PDF without a metadata object yields None for every descriptive field.
    info = {name: (getattr(meta, name) if meta else None) for name in field_names}
    info["page_count"] = len(reader.pages)
    return info
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def load_docx(filepath, include_tables=False):
    """Load a .docx file and return its non-empty paragraph texts.

    Args:
        filepath: Path to the .docx file.
        include_tables: If True, also extract the document's tables.

    Returns:
        list: Paragraph texts (whitespace-only paragraphs are skipped).
        If ``include_tables`` is True, one dict per table data row is
        appended after all paragraphs; the first row of each table supplies
        the keys. Tables without rows are skipped, and a cell that has no
        matching header is stored under the key ``col_<index>``.
    """
    doc = DocxDocument(filepath)
    result = [para.text for para in doc.paragraphs if para.text.strip()]

    if include_tables:
        for table in doc.tables:
            rows = list(table.rows)
            if not rows:
                # Nothing to use as a header row; indexing rows[0] would fail.
                continue

            headers = [cell.text.strip() for cell in rows[0].cells]
            for row in rows[1:]:
                row_data = {}
                for i, cell in enumerate(row.cells):
                    # A data row can be wider than the header row; fall back
                    # to a positional key instead of raising IndexError.
                    key = headers[i] if i < len(headers) else f"col_{i}"
                    row_data[key] = cell.text.strip()
                result.append(row_data)

    return result
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def load_docx_with_styles(filepath):
    """Load a .docx file and return each non-empty paragraph with style info.

    Args:
        filepath: Path to the .docx file.

    Returns:
        list[dict]: One dict per paragraph with the keys ``text`` (the
        paragraph text), ``style`` (the style name, or ``None`` when the
        paragraph has no style) and ``bold`` (True when at least one run
        in the paragraph has a truthy ``bold`` attribute).
    """
    styled = []
    for para in DocxDocument(filepath).paragraphs:
        if not para.text.strip():
            continue  # whitespace-only paragraphs are not reported
        style_name = para.style.name if para.style else None
        has_bold_run = any(run.bold for run in para.runs)
        styled.append({"text": para.text, "style": style_name, "bold": has_bold_run})
    return styled
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def load_pptx(filepath, include_notes=False):
    """Load a .pptx file and return the text of every slide.

    Args:
        filepath: Path to the .pptx file.
        include_notes: If True, add the speaker notes of slides that have
            a notes slide.

    Returns:
        list[dict]: One dict per slide with ``slide_number`` (1-based) and
        ``text`` (list of non-empty, stripped paragraph strings). When
        ``include_notes`` is True and the slide has a notes slide, the dict
        also carries a ``notes`` string.
    """

    def slide_lines(slide):
        # Yield the stripped, non-empty paragraph texts of all text-bearing shapes.
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                stripped = paragraph.text.strip()
                if stripped:
                    yield stripped

    deck = Presentation(filepath)
    slides = []
    for number, slide in enumerate(deck.slides, start=1):
        entry = {"slide_number": number, "text": list(slide_lines(slide))}

        if include_notes and slide.has_notes_slide:
            frame = slide.notes_slide.notes_text_frame
            # A notes slide may lack a text frame; report empty notes then.
            entry["notes"] = frame.text.strip() if frame else ""

        slides.append(entry)
    return slides
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def load_xlsx(filepath, sheet_name=None):
    """Load one worksheet of a .xlsx file as a list of row dictionaries.

    Args:
        filepath: Path to the .xlsx file.
        sheet_name: Optional sheet name to load. If falsy (default
            ``None``), the workbook's active sheet is used.

    Returns:
        list[dict]: One dict per data row, keyed by the first row's cell
        values converted to ``str``. A header cell that is ``None`` gets
        the key ``col_<index>``. An empty sheet yields ``[]``.

    Raises:
        KeyError: If ``sheet_name`` is given but no such sheet exists
            (raised by the workbook lookup).
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    try:
        ws = wb[sheet_name] if sheet_name else wb.active
        rows = list(ws.iter_rows(values_only=True))
    finally:
        # Always release the workbook, including when the sheet lookup or
        # the row iteration raises.
        wb.close()

    if not rows:
        return []

    headers = [str(h) if h is not None else f"col_{i}" for i, h in enumerate(rows[0])]
    return [dict(zip(headers, row)) for row in rows[1:]]
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def load_xlsx_sheets(filepath):
    """Return the names of all sheets in an Excel workbook.

    Args:
        filepath: Path to the .xlsx file.

    Returns:
        list[str]: Sheet names as reported by the workbook.
    """
    workbook = openpyxl.load_workbook(filepath, read_only=True)
    names = workbook.sheetnames
    workbook.close()
    return names
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
smart_loader module - Unified interface for loading any supported document.
|
|
3
|
+
Auto-detects file type and dispatches to the appropriate loader.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
from . import text_loader, doc_loader
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Dispatch table mapping a lower-cased file extension (leading dot included)
# to the loader function that handles it. auto_load() looks loaders up here,
# and get_file_info() / supported_formats() report support based on its keys.
# Several extensions deliberately share one loader (.yaml/.yml, .ini/.cfg,
# .html/.htm).
_LOADERS = {
    ".txt": text_loader.load_txt,
    ".csv": text_loader.load_csv,
    ".json": text_loader.load_json,
    ".xml": text_loader.load_xml,
    ".yaml": text_loader.load_yaml,
    ".yml": text_loader.load_yaml,
    ".ini": text_loader.load_ini,
    ".cfg": text_loader.load_ini,
    ".html": text_loader.load_html,
    ".htm": text_loader.load_html,
    ".pdf": doc_loader.load_pdf,
    ".docx": doc_loader.load_docx,
    ".pptx": doc_loader.load_pptx,
    ".xlsx": doc_loader.load_xlsx,
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def auto_load(filepath):
    """Load a file with the loader registered for its extension.

    The extension is compared case-insensitively against the loader table.

    Args:
        filepath: Path to any supported file.

    Returns:
        Whatever the selected loader returns (type depends on the format).

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If no loader is registered for the file's extension.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    _, suffix = os.path.splitext(filepath)
    ext = suffix.lower()

    loader = _LOADERS.get(ext)
    if loader is None:
        known = ", ".join(sorted(_LOADERS.keys()))
        raise ValueError(f"Unsupported file format: '{ext}'. Supported formats: {known}")

    return loader(filepath)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def batch_load(filepaths):
    """Load several files via ``auto_load`` without aborting on failures.

    Args:
        filepaths: Iterable of file paths to load.

    Returns:
        dict: Maps each path to its loaded content. If loading a path
        raises, its value is ``{"error": <exception message>}`` instead.
    """

    def attempt(path):
        # Best-effort by design: any failure is reported in the result
        # rather than propagated, so one bad file cannot abort the batch.
        try:
            return auto_load(path)
        except Exception as exc:
            return {"error": str(exc)}

    return {path: attempt(path) for path in filepaths}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_file_info(filepath):
    """Collect basic filesystem metadata about a file.

    Args:
        filepath: Path to any file.

    Returns:
        dict: ``name``, ``extension`` (lower-cased), ``size_bytes``,
        ``size_readable``, ``modified_timestamp`` (``st_mtime``),
        ``is_supported`` (whether a loader exists for the extension) and
        ``absolute_path``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    file_stat = os.stat(filepath)
    _, suffix = os.path.splitext(filepath)
    ext = suffix.lower()

    info = {}
    info["name"] = os.path.basename(filepath)
    info["extension"] = ext
    info["size_bytes"] = file_stat.st_size
    info["size_readable"] = _format_size(file_stat.st_size)
    info["modified_timestamp"] = file_stat.st_mtime
    info["is_supported"] = ext in _LOADERS
    info["absolute_path"] = os.path.abspath(filepath)
    return info
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def supported_formats():
    """List every file extension that has a registered loader.

    Returns:
        list[str]: Supported extensions (with leading dot), sorted.
    """
    # Dict keys are already unique, so sorting them directly is sufficient.
    return sorted(_LOADERS)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _format_size(size_bytes):
    """Render a byte count as a string with one decimal and a binary unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, e.g. ``1536`` -> ``"1.5 KB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    value = size_bytes
    step = 0
    # Stop at the last unit so very large values are still expressed in TB.
    while step < len(units) - 1 and not value < 1024:
        value = value / 1024
        step += 1
    return f"{value:.1f} {units[step]}"
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""
|
|
2
|
+
text_loader module - Load text-based document formats.
|
|
3
|
+
Supports: .txt, .csv, .json, .xml, .yaml/.yml, .ini, .html
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import csv
|
|
7
|
+
import json
|
|
8
|
+
import configparser
|
|
9
|
+
import xml.etree.ElementTree as ET
|
|
10
|
+
from html.parser import HTMLParser
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def load_txt(filepath, encoding="utf-8"):
    """Read a text file and return its entire content.

    Args:
        filepath: Path to the .txt file.
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        str: The full file content.
    """
    with open(filepath, mode="r", encoding=encoding) as handle:
        contents = handle.read()
    return contents
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_csv(filepath, delimiter=",", encoding="utf-8"):
    """Read a CSV file into a list of row dictionaries.

    Args:
        filepath: Path to the .csv file.
        delimiter: Column separator (default: comma).
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        list[dict]: One dict per data row, keyed by the header row.
    """
    # newline="" leaves line-ending handling to the csv module.
    with open(filepath, mode="r", encoding=encoding, newline="") as handle:
        records = [*csv.DictReader(handle, delimiter=delimiter)]
    return records
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def load_json(filepath, encoding="utf-8"):
    """Read a JSON file and return the parsed data.

    Args:
        filepath: Path to the .json file.
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        The parsed JSON value (typically a dict or a list).
    """
    with open(filepath, mode="r", encoding=encoding) as handle:
        raw = handle.read()
    return json.loads(raw)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def load_xml(filepath):
    """Parse an XML file into a nested dictionary.

    Args:
        filepath: Path to the .xml file.

    Returns:
        dict: Representation of the root element. Every element becomes a
        dict with ``tag`` and, only when present, ``attributes`` (dict),
        ``text`` (stripped, non-empty text) and ``children`` (list of
        child dicts in document order).
    """

    def to_dict(node):
        entry = {"tag": node.tag}
        if node.attrib:
            entry["attributes"] = dict(node.attrib)

        stripped = (node.text or "").strip()
        if stripped:
            entry["text"] = stripped

        kids = list(map(to_dict, node))
        if kids:
            entry["children"] = kids
        return entry

    return to_dict(ET.parse(filepath).getroot())
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def load_yaml(filepath, encoding="utf-8"):
    """Read a YAML file and return the parsed data.

    PyYAML is an optional dependency and is imported on first use; parsing
    goes through ``yaml.safe_load``.

    Args:
        filepath: Path to the .yaml or .yml file.
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        The parsed YAML value (typically a dict or a list).

    Raises:
        ImportError: If PyYAML is not installed.
    """
    try:
        import yaml as yaml_module
    except ImportError:
        raise ImportError("PyYAML is required for YAML support. Install it with: pip install pyyaml")

    with open(filepath, mode="r", encoding=encoding) as stream:
        parsed = yaml_module.safe_load(stream)
    return parsed
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def load_ini(filepath, encoding="utf-8"):
    """Load a .ini/.cfg file and return it as a nested dictionary.

    The file is opened explicitly so that a missing or unreadable path
    raises, as it does in the other loaders. ``ConfigParser.read()`` would
    silently skip such a file and an empty dict would be returned.

    Args:
        filepath: Path to the .ini file.
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        dict: Maps each section name to a dict of that section's options.
        Options come back as ``ConfigParser`` reports them for the section,
        so option names are lower-cased and values from ``[DEFAULT]`` appear
        in every section.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        OSError: If the file cannot be opened for another reason.
    """
    config = configparser.ConfigParser()
    with open(filepath, "r", encoding=encoding) as f:
        config.read_file(f)
    return {section: dict(config[section]) for section in config.sections()}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def load_html(filepath, encoding="utf-8"):
    """Load an .html file and extract its visible text (tags stripped).

    Text inside ``<script>`` and ``<style>`` elements is ignored. Every
    remaining text chunk is stripped, empty chunks are dropped, and the
    chunks are joined with newlines.

    Args:
        filepath: Path to the .html file.
        encoding: Text encoding used to decode the file (default: utf-8).

    Returns:
        str: Extracted text content without HTML tags.
    """

    class _HTMLTextExtractor(HTMLParser):
        """Collect text chunks that are outside script/style elements."""

        def __init__(self):
            super().__init__()
            self._texts = []
            self._skip_tags = {"script", "style"}
            self._skip = False

        def handle_starttag(self, tag, attrs):
            if tag.lower() in self._skip_tags:
                self._skip = True

        def handle_endtag(self, tag):
            if tag.lower() in self._skip_tags:
                self._skip = False

        def handle_data(self, data):
            if not self._skip:
                text = data.strip()
                if text:
                    self._texts.append(text)

        def get_text(self):
            return "\n".join(self._texts)

    with open(filepath, "r", encoding=encoding) as f:
        content = f.read()

    extractor = _HTMLTextExtractor()
    extractor.feed(content)
    # feed() may hold back trailing text that could still be part of an
    # incomplete character reference (e.g. a document ending in "AT&T");
    # close() forces the parser to process whatever is still buffered.
    extractor.close()
    return extractor.get_text()
|
smartdocloader/utils.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""
|
|
2
|
+
utils module - Utility functions for content processing and search.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def search_content(data, keyword, case_sensitive=False):
    """Find the parts of loaded content that contain a keyword.

    Args:
        data: Loaded content (str, list, or dict).
        keyword: Text to search for (substring match).
        case_sensitive: Whether the match is case-sensitive (default: False).

    Returns:
        list: What is returned depends on the type of ``data``:
        - str: the lines containing the keyword;
        - list: matching string items, plus dict items in which the string
          form of at least one value matches (other item types are ignored);
        - dict: a ``{key: value}`` dict for every value whose string form
          matches;
        - anything else: an empty list.
    """
    needle = keyword if case_sensitive else keyword.lower()

    def contains(text):
        haystack = text if case_sensitive else text.lower()
        return needle in haystack

    if isinstance(data, str):
        return [line for line in data.splitlines() if contains(line)]

    if isinstance(data, list):
        hits = []
        for item in data:
            if isinstance(item, str):
                if contains(item):
                    hits.append(item)
            elif isinstance(item, dict):
                # A dict item is reported once, however many values match.
                if any(contains(str(value)) for value in item.values()):
                    hits.append(item)
        return hits

    if isinstance(data, dict):
        return [{key: value} for key, value in data.items() if contains(str(value))]

    return []
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def convert_to_text(data):
    """Flatten loaded data into a plain text string.

    Useful for bringing the different loader outputs into one text format.

    Args:
        data: Loaded content (str, list, dict, or anything else).

    Returns:
        str: Strings are returned unchanged. A list yields one line per
        item (dict items as ``key: value`` pairs joined by `` | ``, list
        items as their elements joined by `` | ``). A dict yields one
        ``key: value`` line per entry, except that a dict value is written
        as a ``[key]`` header followed by one line per inner entry. Any
        other value is passed through ``str()``.
    """

    def render_item(item):
        # One output line for a single element of a list.
        if isinstance(item, str):
            return item
        if isinstance(item, dict):
            return " | ".join(f"{k}: {v}" for k, v in item.items())
        if isinstance(item, list):
            return " | ".join(map(str, item))
        return str(item)

    def render_entry(key, value):
        # Output lines for a single key/value pair of a dict.
        if not isinstance(value, dict):
            return [f"{key}: {value}"]
        return [f"[{key}]"] + [f" {k}: {v}" for k, v in value.items()]

    if isinstance(data, str):
        return data

    if isinstance(data, list):
        return "\n".join(render_item(item) for item in data)

    if isinstance(data, dict):
        collected = []
        for key, value in data.items():
            collected.extend(render_entry(key, value))
        return "\n".join(collected)

    return str(data)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def word_count(data):
    """Count the whitespace-separated words in loaded content.

    The content is first flattened with ``convert_to_text``.

    Args:
        data: Loaded content (str, list, or dict).

    Returns:
        int: Number of whitespace-separated tokens in the flattened text.
    """
    words = convert_to_text(data).split()
    return len(words)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def export_to_json(data, output_path, indent=2):
    """Write loaded data to a UTF-8 JSON file.

    Values JSON cannot represent directly are converted first: dict keys
    become strings, tuples become lists, and any object that is not a
    str/int/float/bool/None/dict/list/tuple is replaced by ``str(obj)``.

    Args:
        data: Loaded content to export.
        output_path: Path of the JSON file to write.
        indent: JSON indentation level (default: 2).
    """
    primitives = (str, int, float, bool, type(None))

    def to_plain(value):
        # Recursively reduce a value to JSON-compatible building blocks.
        if isinstance(value, primitives):
            return value
        if isinstance(value, dict):
            return {str(key): to_plain(inner) for key, inner in value.items()}
        if isinstance(value, (list, tuple)):
            return list(map(to_plain, value))
        return str(value)

    # Convert before opening the file so a conversion failure leaves no file behind.
    payload = to_plain(data)
    with open(output_path, mode="w", encoding="utf-8") as out:
        json.dump(payload, out, indent=indent, ensure_ascii=False)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def compare_files(filepath1, filepath2):
    """Compare the text content of two files and summarise the differences.

    Both files are loaded with ``auto_load`` and flattened with
    ``convert_to_text``. Lines are compared by presence, not by position:
    a line counts as "only in" one file when the other file contains no
    identical line anywhere.

    Args:
        filepath1: Path to the first file.
        filepath2: Path to the second file.

    Returns:
        dict: ``identical`` (True when the flattened texts are equal),
        ``file1_lines`` / ``file2_lines`` (line counts), ``file1_words`` /
        ``file2_words`` (whitespace-separated word counts) and
        ``only_in_file1`` / ``only_in_file2`` (at most 20 lines each, in
        file order).
    """
    # Imported inside the function, as in the original code, rather than at
    # module import time.
    from .smart_loader import auto_load

    data1 = auto_load(filepath1)
    data2 = auto_load(filepath2)

    text1 = convert_to_text(data1)
    text2 = convert_to_text(data2)

    lines1 = text1.splitlines()
    lines2 = text2.splitlines()

    # Build the lookup sets once so each membership test is O(1) instead of
    # scanning the other file's whole line list for every line.
    seen_in_first = set(lines1)
    seen_in_second = set(lines2)

    only_in_first = [line for line in lines1 if line not in seen_in_second]
    only_in_second = [line for line in lines2 if line not in seen_in_first]

    return {
        "identical": text1 == text2,
        "file1_lines": len(lines1),
        "file2_lines": len(lines2),
        "file1_words": len(text1.split()),
        "file2_words": len(text2.split()),
        "only_in_file1": only_in_first[:20],  # Limit output
        "only_in_file2": only_in_second[:20],
    }
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smartdocloader
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An advanced universal document loader - load TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX with smart auto-detection
|
|
5
|
+
Author: Your Name
|
|
6
|
+
Author-email: your.email@example.com
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Source, https://github.com/yourusername/smartdocloader
|
|
9
|
+
Keywords: document,loader,pdf,docx,xlsx,csv,json,yaml,parser,reader
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Requires-Python: >=3.7
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE.txt
|
|
25
|
+
Requires-Dist: PyPDF2
|
|
26
|
+
Requires-Dist: python-docx
|
|
27
|
+
Requires-Dist: python-pptx
|
|
28
|
+
Requires-Dist: openpyxl
|
|
29
|
+
Provides-Extra: yaml
|
|
30
|
+
Requires-Dist: pyyaml; extra == "yaml"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pyyaml; extra == "all"
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: keywords
|
|
39
|
+
Dynamic: license
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
Dynamic: project-url
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
46
|
+
|
|
47
|
+
# smartdocloader
|
|
48
|
+
|
|
49
|
+
An advanced universal document loader for Python. Smart auto-detection, batch loading, content search, and support for 11+ file formats.
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install smartdocloader
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For YAML support:
|
|
58
|
+
```bash
|
|
59
|
+
pip install smartdocloader[yaml]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Features
|
|
63
|
+
|
|
64
|
+
- **Auto-detection**: Automatically detects file type and uses the right loader
|
|
65
|
+
- **Batch loading**: Load multiple files in one call
|
|
66
|
+
- **11 formats supported**: TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX
|
|
67
|
+
- **Content search**: Search within loaded data
|
|
68
|
+
- **Export**: Convert loaded data to JSON
|
|
69
|
+
- **File comparison**: Compare content of two files
|
|
70
|
+
- **Advanced options**: Page ranges for PDFs, sheet selection for Excel, table extraction from Word
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from smartdocloader import auto_load
|
|
78
|
+
|
|
79
|
+
# Just pass any file - it auto-detects the format
|
|
80
|
+
data = auto_load("report.pdf")
|
|
81
|
+
data = auto_load("data.csv")
|
|
82
|
+
data = auto_load("config.yaml")
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## All Supported Formats
|
|
88
|
+
|
|
89
|
+
| Format | Extensions | Module |
|
|
90
|
+
|--------|-----------|--------|
|
|
91
|
+
| Plain Text | `.txt` | text_loader |
|
|
92
|
+
| CSV | `.csv` | text_loader |
|
|
93
|
+
| JSON | `.json` | text_loader |
|
|
94
|
+
| XML | `.xml` | text_loader |
|
|
95
|
+
| YAML | `.yaml`, `.yml` | text_loader |
|
|
96
|
+
| INI/Config | `.ini`, `.cfg` | text_loader |
|
|
97
|
+
| HTML | `.html`, `.htm` | text_loader |
|
|
98
|
+
| PDF | `.pdf` | doc_loader |
|
|
99
|
+
| Word | `.docx` | doc_loader |
|
|
100
|
+
| PowerPoint | `.pptx` | doc_loader |
|
|
101
|
+
| Excel | `.xlsx` | doc_loader |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Modules & Functions
|
|
106
|
+
|
|
107
|
+
### Module 1: `text_loader`
|
|
108
|
+
|
|
109
|
+
| Function | Description |
|
|
110
|
+
|----------|-------------|
|
|
111
|
+
| `load_txt(filepath, encoding)` | Load plain text file |
|
|
112
|
+
| `load_csv(filepath, delimiter, encoding)` | Load CSV as list of dicts |
|
|
113
|
+
| `load_json(filepath, encoding)` | Load and parse JSON |
|
|
114
|
+
| `load_xml(filepath)` | Load XML as nested dict |
|
|
115
|
+
| `load_yaml(filepath, encoding)` | Load YAML data |
|
|
116
|
+
| `load_ini(filepath, encoding)` | Load INI as nested dict |
|
|
117
|
+
| `load_html(filepath, encoding)` | Extract text from HTML |
|
|
118
|
+
|
|
119
|
+
### Module 2: `doc_loader`
|
|
120
|
+
|
|
121
|
+
| Function | Description |
|
|
122
|
+
|----------|-------------|
|
|
123
|
+
| `load_pdf(filepath, page_range)` | Extract text from PDF (optional page range) |
|
|
124
|
+
| `load_pdf_pages(filepath)` | Get text per page as a list |
|
|
125
|
+
| `load_pdf_metadata(filepath)` | Get PDF metadata (author, title, etc.) |
|
|
126
|
+
| `load_docx(filepath, include_tables)` | Load Word document paragraphs |
|
|
127
|
+
| `load_docx_with_styles(filepath)` | Load with style/formatting info |
|
|
128
|
+
| `load_pptx(filepath, include_notes)` | Load PowerPoint slides |
|
|
129
|
+
| `load_xlsx(filepath, sheet_name)` | Load Excel data from specific sheet |
|
|
130
|
+
| `load_xlsx_sheets(filepath)` | List all sheet names |
|
|
131
|
+
|
|
132
|
+
### Module 3: `smart_loader`
|
|
133
|
+
|
|
134
|
+
| Function | Description |
|
|
135
|
+
|----------|-------------|
|
|
136
|
+
| `auto_load(filepath)` | Auto-detect format and load |
|
|
137
|
+
| `batch_load(filepaths)` | Load multiple files at once |
|
|
138
|
+
| `get_file_info(filepath)` | Get file metadata (size, type, etc.) |
|
|
139
|
+
| `supported_formats()` | List all supported extensions |
|
|
140
|
+
|
|
141
|
+
### Module 4: `utils`
|
|
142
|
+
|
|
143
|
+
| Function | Description |
|
|
144
|
+
|----------|-------------|
|
|
145
|
+
| `search_content(data, keyword, case_sensitive)` | Search within loaded content (case-insensitive by default) |
|
|
146
|
+
| `convert_to_text(data)` | Convert any loaded data to plain text |
|
|
147
|
+
| `word_count(data)` | Count words in loaded content |
|
|
148
|
+
| `export_to_json(data, output_path, indent)` | Export loaded data to JSON file |
|
|
149
|
+
| `compare_files(filepath1, filepath2)` | Compare two files |
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Usage Examples
|
|
154
|
+
|
|
155
|
+
### Auto-Loading (Smart Detection)
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from smartdocloader import auto_load
|
|
159
|
+
|
|
160
|
+
# Just pass any file path - format is auto-detected
|
|
161
|
+
pdf_content = auto_load("report.pdf")
|
|
162
|
+
csv_data = auto_load("students.csv")
|
|
163
|
+
config = auto_load("settings.yaml")
|
|
164
|
+
|
|
165
|
+
print(pdf_content[:100])
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Batch Loading Multiple Files
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from smartdocloader import batch_load
|
|
172
|
+
|
|
173
|
+
files = ["data.csv", "report.pdf", "config.json", "notes.txt"]
|
|
174
|
+
results = batch_load(files)
|
|
175
|
+
|
|
176
|
+
for filepath, content in results.items():
|
|
177
|
+
if isinstance(content, dict) and "error" in content:
|
|
178
|
+
print(f"Failed: {filepath} - {content['error']}")
|
|
179
|
+
else:
|
|
180
|
+
print(f"Loaded: {filepath}")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Loading PDFs with Options
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from smartdocloader import load_pdf, load_pdf_pages, load_pdf_metadata
|
|
187
|
+
|
|
188
|
+
# Load entire PDF
|
|
189
|
+
full_text = load_pdf("book.pdf")
|
|
190
|
+
|
|
191
|
+
# Load only pages 0-4 (first 5 pages)
|
|
192
|
+
intro = load_pdf("book.pdf", page_range=(0, 5))
|
|
193
|
+
|
|
194
|
+
# Get text per page
|
|
195
|
+
pages = load_pdf_pages("book.pdf")
|
|
196
|
+
print(f"Page 1: {pages[0][:100]}")
|
|
197
|
+
print(f"Total pages: {len(pages)}")
|
|
198
|
+
|
|
199
|
+
# Get metadata
|
|
200
|
+
meta = load_pdf_metadata("book.pdf")
|
|
201
|
+
print(f"Author: {meta['author']}")
|
|
202
|
+
print(f"Title: {meta['title']}")
|
|
203
|
+
print(f"Pages: {meta['page_count']}")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Loading Word Documents
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
from smartdocloader import load_docx, load_docx_with_styles
|
|
210
|
+
|
|
211
|
+
# Basic loading
|
|
212
|
+
paragraphs = load_docx("report.docx")
|
|
213
|
+
for p in paragraphs:
|
|
214
|
+
    print(p)
|
|
215
|
+
|
|
216
|
+
# With tables included
|
|
217
|
+
content = load_docx("report.docx", include_tables=True)
|
|
218
|
+
for item in content:
|
|
219
|
+
    print(item)
|
|
220
|
+
|
|
221
|
+
# With style information
|
|
222
|
+
styled = load_docx_with_styles("report.docx")
|
|
223
|
+
for para in styled:
|
|
224
|
+
    if para["bold"]:
|
|
225
|
+
        print(f"[BOLD] {para['text']}")
|
|
226
|
+
    else:
|
|
227
|
+
        print(f" {para['text']}")
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Loading Excel with Sheet Selection
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from smartdocloader import load_xlsx, load_xlsx_sheets
|
|
234
|
+
|
|
235
|
+
# See available sheets
|
|
236
|
+
sheets = load_xlsx_sheets("financials.xlsx")
|
|
237
|
+
print(f"Sheets: {sheets}")
|
|
238
|
+
|
|
239
|
+
# Load specific sheet
|
|
240
|
+
q1_data = load_xlsx("financials.xlsx", sheet_name="Q1")
|
|
241
|
+
for row in q1_data:
|
|
242
|
+
    print(row)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Loading PowerPoint with Notes
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from smartdocloader import load_pptx
|
|
249
|
+
|
|
250
|
+
slides = load_pptx("lecture.pptx", include_notes=True)
|
|
251
|
+
for slide in slides:
|
|
252
|
+
    print(f"--- Slide {slide['slide_number']} ---")
|
|
253
|
+
    for text in slide["text"]:
|
|
254
|
+
        print(f" {text}")
|
|
255
|
+
    if "notes" in slide and slide["notes"]:
|
|
256
|
+
        print(f" [Notes: {slide['notes']}]")
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Loading YAML Configuration
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
from smartdocloader import load_yaml
|
|
263
|
+
|
|
264
|
+
config = load_yaml("docker-compose.yml")
|
|
265
|
+
print(config["services"])
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Loading INI/Config Files
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
from smartdocloader import load_ini
|
|
272
|
+
|
|
273
|
+
settings = load_ini("app.ini")
|
|
274
|
+
print(settings["database"]["host"])
|
|
275
|
+
print(settings["database"]["port"])
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Loading HTML (Text Extraction)
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
from smartdocloader import load_html
|
|
282
|
+
|
|
283
|
+
text = load_html("page.html")
|
|
284
|
+
print(text) # Clean text without HTML tags
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Searching Within Loaded Content
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from smartdocloader import auto_load, search_content
|
|
291
|
+
|
|
292
|
+
# Load any file
|
|
293
|
+
data = auto_load("students.csv")
|
|
294
|
+
|
|
295
|
+
# Search for a keyword
|
|
296
|
+
matches = search_content(data, "Ahmed")
|
|
297
|
+
print(f"Found {len(matches)} matches:")
|
|
298
|
+
for match in matches:
|
|
299
|
+
    print(f" {match}")
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Converting to Plain Text
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
from smartdocloader import auto_load, convert_to_text
|
|
306
|
+
|
|
307
|
+
# Load structured data
|
|
308
|
+
data = auto_load("grades.xlsx")
|
|
309
|
+
|
|
310
|
+
# Convert to flat text
|
|
311
|
+
text = convert_to_text(data)
|
|
312
|
+
print(text)
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
### Exporting to JSON
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
from smartdocloader import auto_load, export_to_json
|
|
319
|
+
|
|
320
|
+
# Load a Word document
|
|
321
|
+
data = auto_load("report.docx")
|
|
322
|
+
|
|
323
|
+
# Export as JSON for further processing
|
|
324
|
+
export_to_json(data, "report_output.json")
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### Comparing Two Files
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
from smartdocloader import compare_files
|
|
331
|
+
|
|
332
|
+
result = compare_files("version1.txt", "version2.txt")
|
|
333
|
+
print(f"Identical: {result['identical']}")
|
|
334
|
+
print(f"File 1: {result['file1_lines']} lines, {result['file1_words']} words")
|
|
335
|
+
print(f"File 2: {result['file2_lines']} lines, {result['file2_words']} words")
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Getting File Info
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
from smartdocloader import get_file_info
|
|
342
|
+
|
|
343
|
+
info = get_file_info("report.pdf")
|
|
344
|
+
print(f"Name: {info['name']}")
|
|
345
|
+
print(f"Size: {info['size_readable']}")
|
|
346
|
+
print(f"Supported: {info['is_supported']}")
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Listing Supported Formats
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from smartdocloader import supported_formats
|
|
353
|
+
|
|
354
|
+
formats = supported_formats()
|
|
355
|
+
print(f"Supported: {', '.join(formats)}")
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
---
|
|
359
|
+
|
|
360
|
+
## Error Handling
|
|
361
|
+
|
|
362
|
+
```python
|
|
363
|
+
from smartdocloader import auto_load
|
|
364
|
+
|
|
365
|
+
try:
|
|
366
|
+
    data = auto_load("unknown.xyz")
|
|
367
|
+
except ValueError as e:
|
|
368
|
+
    print(f"Format error: {e}")
|
|
369
|
+
except FileNotFoundError as e:
|
|
370
|
+
    print(f"File missing: {e}")
|
|
371
|
+
except Exception as e:
|
|
372
|
+
    print(f"Error: {e}")
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
---
|
|
376
|
+
|
|
377
|
+
## Requirements
|
|
378
|
+
|
|
379
|
+
- Python >= 3.7
|
|
380
|
+
- PyPDF2
|
|
381
|
+
- python-docx
|
|
382
|
+
- python-pptx
|
|
383
|
+
- openpyxl
|
|
384
|
+
- pyyaml (optional, for YAML support)
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
## License
|
|
389
|
+
|
|
390
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
smartdocloader/__init__.py,sha256=F7QQOV_yB811Inqxoepbe0aLSDCCQ7GUmthr4ONlPj4,1265
|
|
2
|
+
smartdocloader/doc_loader.py,sha256=YwWnDnx8i0ZtXifKdVXjj5ZO51vbih4oUqOjINqEqII,5603
|
|
3
|
+
smartdocloader/smart_loader.py,sha256=33854mZ14Z6rIo928FxNzJ-kjUteAeDH9f9lQQeYyOU,3263
|
|
4
|
+
smartdocloader/text_loader.py,sha256=3svOQn6rnabJOw995zneH9fmzejJIhwYf92aGN6ogFM,4572
|
|
5
|
+
smartdocloader/utils.py,sha256=c6g7jIscWUgKMTqGt_ORZrBTA5lvPXLWtcHofNjrlhw,5060
|
|
6
|
+
smartdocloader-1.0.0.dist-info/licenses/LICENSE.txt,sha256=u8nMEhpntc7IP_2mYqSBf-ma-DlZ96r0xQyTAd3G5T8,1062
|
|
7
|
+
smartdocloader-1.0.0.dist-info/METADATA,sha256=6y5EDLuQ1umRIpHBGzmoSqLdgzR9_QlqgPggXOjpYis,10180
|
|
8
|
+
smartdocloader-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
9
|
+
smartdocloader-1.0.0.dist-info/top_level.txt,sha256=H6Vf0XUxub-LDjLTJdtsclakQ6II551OwamRyW-51_c,15
|
|
10
|
+
smartdocloader-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2026
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
smartdocloader
|