smartdocloader 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ """
2
+ smartdocloader - An advanced universal document loader library.
3
+ Load content from multiple file formats with a single smart API.
4
+ Supports: TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX
5
+ """
6
+
7
+ from .text_loader import load_txt, load_csv, load_json, load_xml, load_yaml, load_ini, load_html
8
+ from .doc_loader import (
9
+ load_pdf, load_pdf_pages, load_pdf_metadata,
10
+ load_docx, load_docx_with_styles,
11
+ load_pptx,
12
+ load_xlsx, load_xlsx_sheets,
13
+ )
14
+ from .smart_loader import auto_load, batch_load, get_file_info, supported_formats
15
+ from .utils import search_content, convert_to_text, word_count, export_to_json, compare_files
16
+
17
+ __version__ = "1.0.0"
18
+
19
+ __all__ = [
20
+ # text_loader
21
+ "load_txt",
22
+ "load_csv",
23
+ "load_json",
24
+ "load_xml",
25
+ "load_yaml",
26
+ "load_ini",
27
+ "load_html",
28
+ # doc_loader
29
+ "load_pdf",
30
+ "load_pdf_pages",
31
+ "load_pdf_metadata",
32
+ "load_docx",
33
+ "load_docx_with_styles",
34
+ "load_pptx",
35
+ "load_xlsx",
36
+ "load_xlsx_sheets",
37
+ # smart_loader
38
+ "auto_load",
39
+ "batch_load",
40
+ "get_file_info",
41
+ "supported_formats",
42
+ # utils
43
+ "search_content",
44
+ "convert_to_text",
45
+ "word_count",
46
+ "export_to_json",
47
+ "compare_files",
48
+ ]
@@ -0,0 +1,192 @@
1
+ """
2
+ doc_loader module - Load common document formats.
3
+ Supports: .pdf, .docx, .pptx, .xlsx with advanced options
4
+ """
5
+
6
+ import openpyxl
7
+ from docx import Document as DocxDocument
8
+ from pptx import Presentation
9
+ from PyPDF2 import PdfReader
10
+
11
+
12
def load_pdf(filepath, page_range=None):
    """Extract the text of a .pdf file and return it as a single string.

    Args:
        filepath: Path to the .pdf file
        page_range: Optional ``(start, end)`` pair applied as the slice
            ``pages[start:end]`` (0-indexed, ``end`` exclusive). Any falsy
            value, including the default None, selects every page.

    Returns:
        str: Text of the selected pages joined with newlines. Pages for
             which extraction yields no text are left out.
    """
    selected = PdfReader(filepath).pages
    if page_range:
        first, last = page_range
        selected = selected[first:last]

    extracted = (page.extract_text() for page in selected)
    return "\n".join(chunk for chunk in extracted if chunk)
36
+
37
+
38
def load_pdf_pages(filepath):
    """Return the text of a .pdf file as one string per page.

    Args:
        filepath: Path to the .pdf file

    Returns:
        list[str]: One entry per page, in page order. A page whose
                   extraction yields nothing is represented by "".
    """
    reader = PdfReader(filepath)
    # `or ""` normalises None / empty extraction results to an empty string.
    return [page.extract_text() or "" for page in reader.pages]
53
+
54
+
55
def load_pdf_metadata(filepath):
    """Collect basic document metadata from a .pdf file.

    Args:
        filepath: Path to the .pdf file

    Returns:
        dict: Keys 'title', 'author', 'subject', 'creator', 'producer' and
              'page_count'. The five descriptive fields are None when the
              reader's metadata object is missing or falsy.
    """
    reader = PdfReader(filepath)
    document_info = reader.metadata

    descriptive_fields = ("title", "author", "subject", "creator", "producer")
    summary = {
        name: (getattr(document_info, name) if document_info else None)
        for name in descriptive_fields
    }
    summary["page_count"] = len(reader.pages)
    return summary
75
+
76
+
77
def load_docx(filepath, include_tables=False):
    """Load a .docx file and return its non-empty paragraph texts.

    Args:
        filepath: Path to the .docx file
        include_tables: If True, also extract text from tables

    Returns:
        list: Paragraph texts (whitespace-only paragraphs skipped). If
              include_tables=True, every table row after the first is
              appended as a dict keyed by the first row's cell texts. Table
              rows come after all paragraphs, not in document order.
    """
    doc = DocxDocument(filepath)
    result = [para.text for para in doc.paragraphs if para.text.strip()]

    if include_tables:
        for table in doc.tables:
            rows = list(table.rows)
            if not rows:
                # A table without rows has no header row to key on.
                continue
            headers = [cell.text.strip() for cell in rows[0].cells]
            for row in rows[1:]:
                row_data = {}
                for i, cell in enumerate(row.cells):
                    # Rows wider than the header get a positional key
                    # instead of raising IndexError.
                    key = headers[i] if i < len(headers) else f"col_{i}"
                    row_data[key] = cell.text.strip()
                result.append(row_data)

    return result
99
+
100
+
101
def load_docx_with_styles(filepath):
    """Load a .docx file and describe each non-empty paragraph.

    Args:
        filepath: Path to the .docx file

    Returns:
        list[dict]: One dict per paragraph with visible text, holding
                    'text' (the paragraph text), 'style' (the style name,
                    or None when the paragraph has no style) and 'bold'
                    (True if any run in the paragraph has a truthy bold flag).
    """

    def _describe(paragraph):
        return {
            "text": paragraph.text,
            "style": paragraph.style.name if paragraph.style else None,
            "bold": any(run.bold for run in paragraph.runs),
        }

    document = DocxDocument(filepath)
    return [_describe(p) for p in document.paragraphs if p.text.strip()]
120
+
121
+
122
def load_pptx(filepath, include_notes=False):
    """Load a .pptx file and return the text of every slide.

    Args:
        filepath: Path to the .pptx file
        include_notes: If True, include speaker notes for slides that have
            a notes slide

    Returns:
        list[dict]: One dict per slide with 'slide_number' (1-based) and
                    'text' (list of non-empty, stripped paragraph strings).
                    A 'notes' key is present only when include_notes is True
                    and the slide has a notes slide.
    """
    presentation = Presentation(filepath)
    collected = []

    for number, slide in enumerate(presentation.slides, start=1):
        stripped = (
            paragraph.text.strip()
            for shape in slide.shapes
            if shape.has_text_frame
            for paragraph in shape.text_frame.paragraphs
        )
        entry = {
            "slide_number": number,
            "text": [line for line in stripped if line],
        }

        if include_notes and slide.has_notes_slide:
            frame = slide.notes_slide.notes_text_frame
            entry["notes"] = frame.text.strip() if frame else ""

        collected.append(entry)

    return collected
151
+
152
+
153
def load_xlsx(filepath, sheet_name=None):
    """Load a .xlsx worksheet and return its rows as dictionaries.

    Args:
        filepath: Path to the .xlsx file
        sheet_name: Optional sheet name to load. If falsy (default None),
            the active sheet is used.

    Returns:
        list[dict]: One dict per row after the first. Keys come from the
                    first row; an empty header cell becomes "col_<index>".
                    An empty sheet yields [].

    Raises:
        KeyError: If sheet_name does not exist in the workbook.
    """
    wb = openpyxl.load_workbook(filepath, read_only=True)
    try:
        ws = wb[sheet_name] if sheet_name else wb.active
        rows = list(ws.iter_rows(values_only=True))
    finally:
        # Close the workbook even when the sheet lookup or iteration fails.
        wb.close()

    if not rows:
        return []

    headers = [str(h) if h is not None else f"col_{i}" for i, h in enumerate(rows[0])]
    return [dict(zip(headers, row)) for row in rows[1:]]
178
+
179
+
180
def load_xlsx_sheets(filepath):
    """List the worksheet names of an Excel workbook.

    Args:
        filepath: Path to the .xlsx file

    Returns:
        list[str]: Sheet names as reported by the workbook
    """
    workbook = openpyxl.load_workbook(filepath, read_only=True)
    names = workbook.sheetnames
    workbook.close()
    return names
@@ -0,0 +1,116 @@
1
+ """
2
+ smart_loader module - Unified interface for loading any supported document.
3
+ Auto-detects file type and dispatches to the appropriate loader.
4
+ """
5
+
6
+ import os
7
+ from . import text_loader, doc_loader
8
+
9
+
10
# Dispatch table: file extension (lower-case, including the leading dot) ->
# loader function. auto_load() lower-cases a path's extension and looks it up
# here; get_file_info() and supported_formats() also read this table.
# Several extensions deliberately share one loader
# (.yaml/.yml, .ini/.cfg, .html/.htm).
_LOADERS = {
    ".txt": text_loader.load_txt,
    ".csv": text_loader.load_csv,
    ".json": text_loader.load_json,
    ".xml": text_loader.load_xml,
    ".yaml": text_loader.load_yaml,
    ".yml": text_loader.load_yaml,
    ".ini": text_loader.load_ini,
    ".cfg": text_loader.load_ini,
    ".html": text_loader.load_html,
    ".htm": text_loader.load_html,
    ".pdf": doc_loader.load_pdf,
    ".docx": doc_loader.load_docx,
    ".pptx": doc_loader.load_pptx,
    ".xlsx": doc_loader.load_xlsx,
}
27
+
28
+
29
def auto_load(filepath):
    """Load a file with the loader registered for its extension.

    The extension is compared case-insensitively against the module's
    loader table, and the matching loader is called with only the path,
    so each loader runs with its default options.

    Args:
        filepath: Path to any supported file

    Returns:
        Loaded content (type depends on file format)

    Raises:
        FileNotFoundError: If the path does not exist
        ValueError: If the extension has no registered loader
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    _, suffix = os.path.splitext(filepath)
    suffix = suffix.lower()

    loader = _LOADERS.get(suffix)
    if loader is None:
        known = ", ".join(sorted(_LOADERS.keys()))
        raise ValueError(
            f"Unsupported file format: '{suffix}'. Supported formats: {known}"
        )

    return loader(filepath)
54
+
55
+
56
def batch_load(filepaths):
    """Load several files through auto_load, collecting failures per file.

    Args:
        filepaths: Iterable of file paths to load

    Returns:
        dict: Mapping of filepath -> loaded content. When loading a file
              raises, its value is {'error': <exception message>} and the
              remaining files are still processed.
    """

    def _attempt(path):
        # Best effort by design: one bad file must not abort the batch.
        try:
            return auto_load(path)
        except Exception as exc:
            return {"error": str(exc)}

    return {path: _attempt(path) for path in filepaths}
73
+
74
+
75
def get_file_info(filepath):
    """Describe a file on disk without loading its content.

    Args:
        filepath: Path to any file

    Returns:
        dict: 'name', 'extension' (lower-cased), 'size_bytes',
              'size_readable', 'modified_timestamp' (st_mtime),
              'is_supported' (extension has a registered loader) and
              'absolute_path'

    Raises:
        FileNotFoundError: If the path does not exist
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    file_stat = os.stat(filepath)
    suffix = os.path.splitext(filepath)[1].lower()
    size = file_stat.st_size

    return dict(
        name=os.path.basename(filepath),
        extension=suffix,
        size_bytes=size,
        size_readable=_format_size(size),
        modified_timestamp=file_stat.st_mtime,
        is_supported=suffix in _LOADERS,
        absolute_path=os.path.abspath(filepath),
    )
99
+
100
+
101
def supported_formats():
    """Return every file extension that has a registered loader.

    Returns:
        list[str]: Extensions (with leading dot) in sorted order
    """
    # Dict keys are already unique, so sorting the mapping itself is enough.
    return sorted(_LOADERS)
108
+
109
+
110
+ def _format_size(size_bytes):
111
+ """Convert bytes to human-readable size string."""
112
+ for unit in ["B", "KB", "MB", "GB"]:
113
+ if size_bytes < 1024:
114
+ return f"{size_bytes:.1f} {unit}"
115
+ size_bytes /= 1024
116
+ return f"{size_bytes:.1f} TB"
@@ -0,0 +1,160 @@
1
+ """
2
+ text_loader module - Load text-based document formats.
3
+ Supports: .txt, .csv, .json, .xml, .yaml/.yml, .ini, .html
4
+ """
5
+
6
+ import csv
7
+ import json
8
+ import configparser
9
+ import xml.etree.ElementTree as ET
10
+ from html.parser import HTMLParser
11
+
12
+
13
def load_txt(filepath, encoding="utf-8"):
    """Read a text file and return everything in it as one string.

    Args:
        filepath: Path to the .txt file
        encoding: File encoding (default: utf-8)

    Returns:
        str: Full text content
    """
    with open(filepath, mode="r", encoding=encoding) as source:
        contents = source.read()
    return contents
25
+
26
+
27
def load_csv(filepath, delimiter=",", encoding="utf-8"):
    """Read a delimited text file into a list of row dictionaries.

    The first row supplies the keys (csv.DictReader semantics), and all
    values are returned as strings.

    Args:
        filepath: Path to the .csv file
        delimiter: Column separator (default: comma)
        encoding: File encoding (default: utf-8)

    Returns:
        list[dict]: One dictionary per data row, keyed by header
    """
    # newline="" lets the csv module handle line endings inside quoted fields.
    with open(filepath, mode="r", encoding=encoding, newline="") as source:
        return [row for row in csv.DictReader(source, delimiter=delimiter)]
41
+
42
+
43
def load_json(filepath, encoding="utf-8"):
    """Parse a JSON file and return the decoded value.

    Args:
        filepath: Path to the .json file
        encoding: File encoding (default: utf-8)

    Returns:
        dict or list: Parsed JSON data (or whatever top-level JSON value
                      the file contains)
    """
    with open(filepath, mode="r", encoding=encoding) as source:
        raw = source.read()
    return json.loads(raw)
55
+
56
+
57
def load_xml(filepath):
    """Parse an XML file into nested dictionaries.

    Args:
        filepath: Path to the .xml file

    Returns:
        dict: Representation of the root element. Every element becomes a
              dict with 'tag', plus 'attributes' (if it has any), 'text'
              (stripped, if non-blank) and 'children' (list of child dicts,
              if it has any). Tail text after an element is not included.
    """

    def _convert(node):
        stripped = node.text.strip() if node.text else ""

        node_dict = {"tag": node.tag}
        if node.attrib:
            node_dict["attributes"] = dict(node.attrib)
        if stripped:
            node_dict["text"] = stripped

        kids = list(map(_convert, node))
        if kids:
            node_dict["children"] = kids
        return node_dict

    return _convert(ET.parse(filepath).getroot())
81
+
82
+
83
def load_yaml(filepath, encoding="utf-8"):
    """Parse a YAML file with PyYAML's safe loader.

    PyYAML is imported lazily so the rest of the package works without it.

    Args:
        filepath: Path to the .yaml or .yml file
        encoding: File encoding (default: utf-8)

    Returns:
        dict or list: Parsed YAML data

    Raises:
        ImportError: If PyYAML is not installed
    """
    try:
        import yaml
    except ImportError:
        raise ImportError("PyYAML is required for YAML support. Install it with: pip install pyyaml")

    with open(filepath, mode="r", encoding=encoding) as stream:
        parsed = yaml.safe_load(stream)
    return parsed
103
+
104
+
105
def load_ini(filepath, encoding="utf-8"):
    """Load a .ini/.cfg file and return it as a nested dictionary.

    The file is opened explicitly, so a missing or unreadable file raises
    like the other loaders do. ConfigParser.read() would skip such a file
    silently and produce an empty result.

    Args:
        filepath: Path to the .ini file
        encoding: File encoding (default: utf-8)

    Returns:
        dict: Nested dict where keys are section names and values are dicts
              of that section's options (all values are strings). The
              DEFAULT section does not get its own key, but its options
              appear inside every section.

    Raises:
        FileNotFoundError: If the file does not exist
        configparser.Error: If the file is not valid INI syntax
    """
    config = configparser.ConfigParser()
    with open(filepath, "r", encoding=encoding) as f:
        config.read_file(f)
    return {section: dict(config[section]) for section in config.sections()}
118
+
119
+
120
def load_html(filepath, encoding="utf-8"):
    """Load an .html file and extract visible text content (strips tags).

    Text inside <script> and <style> elements is ignored. Each remaining
    text chunk is stripped, and blank chunks are dropped.

    Args:
        filepath: Path to the .html file
        encoding: File encoding (default: utf-8)

    Returns:
        str: Extracted text chunks joined with newlines
    """

    class _HTMLTextExtractor(HTMLParser):
        """Collects text nodes, skipping the content of script/style tags."""

        def __init__(self):
            super().__init__()
            self._texts = []
            self._skip_tags = {"script", "style"}
            self._skip = False

        def handle_starttag(self, tag, attrs):
            if tag.lower() in self._skip_tags:
                self._skip = True

        def handle_endtag(self, tag):
            if tag.lower() in self._skip_tags:
                self._skip = False

        def handle_data(self, data):
            if not self._skip:
                text = data.strip()
                if text:
                    self._texts.append(text)

        def get_text(self):
            return "\n".join(self._texts)

    with open(filepath, "r", encoding=encoding) as f:
        content = f.read()

    extractor = _HTMLTextExtractor()
    extractor.feed(content)
    # feed() can hold back trailing text (e.g. text after the last tag that
    # contains an unterminated "&") while waiting for more input. close()
    # tells the parser the document is complete so that text is emitted.
    extractor.close()
    return extractor.get_text()
@@ -0,0 +1,166 @@
1
+ """
2
+ utils module - Utility functions for content processing and search.
3
+ """
4
+
5
+ import os
6
+ import json
7
+
8
+
9
def search_content(data, keyword, case_sensitive=False):
    """Find the parts of loaded content that contain a keyword.

    How matches are reported depends on the shape of ``data``:

    - str: every line containing the keyword
    - list: every str item containing the keyword, and every dict item with
      at least one value whose string form contains it (other item types
      are ignored)
    - dict: one ``{key: value}`` dict per value whose string form contains it
    - anything else: no matches

    Args:
        data: Loaded content (str, list, or dict)
        keyword: Text to search for
        case_sensitive: Whether search is case-sensitive (default: False)

    Returns:
        list: Matching items/lines containing the keyword
    """
    needle = keyword if case_sensitive else keyword.lower()

    def _contains(text):
        haystack = text if case_sensitive else text.lower()
        return needle in haystack

    if isinstance(data, str):
        return [line for line in data.splitlines() if _contains(line)]

    if isinstance(data, list):
        found = []
        for entry in data:
            if isinstance(entry, str):
                if _contains(entry):
                    found.append(entry)
            elif isinstance(entry, dict):
                # A dict is reported once, however many of its values match.
                if any(_contains(str(value)) for value in entry.values()):
                    found.append(entry)
        return found

    if isinstance(data, dict):
        return [{key: value} for key, value in data.items() if _contains(str(value))]

    return []
53
+
54
+
55
def convert_to_text(data):
    """Flatten loaded content into a plain text string.

    Strings are returned unchanged. A list becomes one line per item, with
    dict items rendered as "key: value" pairs and list items as values, both
    joined by " | ". A dict becomes one "key: value" line per entry, except
    that a nested dict value is rendered as a "[key]" header followed by one
    line per nested entry. Anything else goes through str().

    Args:
        data: Loaded content (str, list, dict, or nested structures)

    Returns:
        str: Plain text representation of the data
    """

    def _render_item(entry):
        if isinstance(entry, str):
            return entry
        if isinstance(entry, dict):
            return " | ".join(f"{k}: {v}" for k, v in entry.items())
        if isinstance(entry, list):
            return " | ".join(map(str, entry))
        return str(entry)

    if isinstance(data, str):
        return data

    if isinstance(data, list):
        return "\n".join(_render_item(entry) for entry in data)

    if isinstance(data, dict):
        rendered = []
        for name, content in data.items():
            if isinstance(content, dict):
                rendered.append(f"[{name}]")
                rendered.extend(f" {k}: {v}" for k, v in content.items())
            else:
                rendered.append(f"{name}: {content}")
        return "\n".join(rendered)

    return str(data)
94
+
95
+
96
def word_count(data):
    """Count whitespace-separated words in loaded content.

    The content is first flattened with convert_to_text, so keys and
    separators of structured data are counted as words as well.

    Args:
        data: Loaded content (str, list, or dict)

    Returns:
        int: Total word count
    """
    return len(convert_to_text(data).split())
107
+
108
+
109
def export_to_json(data, output_path, indent=2):
    """Write loaded content to a UTF-8 JSON file.

    Values JSON cannot represent are converted first: dict keys become
    strings, tuples become lists, and any other unsupported object is
    replaced by its str() form. Non-ASCII characters are written as-is.

    Args:
        data: Loaded content to export
        output_path: Path for the output JSON file
        indent: JSON indentation level (default: 2)
    """

    def _jsonable(value):
        if isinstance(value, dict):
            return {str(key): _jsonable(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return list(map(_jsonable, value))
        if value is None or isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    payload = _jsonable(data)
    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(payload, sink, indent=indent, ensure_ascii=False)
130
+
131
+
132
def compare_files(filepath1, filepath2):
    """Compare two files and return a summary of their differences.

    Both files are loaded using auto_load and flattened with
    convert_to_text, then compared line by line. The comparison is
    membership-based rather than positional: a line counts as "only in"
    one file when the exact same line appears nowhere in the other, so
    reordered or repeated lines are not reported.

    Args:
        filepath1: Path to first file
        filepath2: Path to second file

    Returns:
        dict: 'identical' (True when the flattened texts are equal),
              'file1_lines' / 'file2_lines', 'file1_words' / 'file2_words',
              and 'only_in_file1' / 'only_in_file2' (at most 20 lines each,
              in original order)
    """
    # Imported here because smart_loader is only needed by this function.
    from .smart_loader import auto_load

    data1 = auto_load(filepath1)
    data2 = auto_load(filepath2)

    text1 = convert_to_text(data1)
    text2 = convert_to_text(data2)

    lines1 = text1.splitlines()
    lines2 = text2.splitlines()

    # Set lookups keep the scan linear; testing membership against the
    # lists made this O(len(lines1) * len(lines2)).
    seen_in_first = set(lines1)
    seen_in_second = set(lines2)

    only_in_first = [line for line in lines1 if line not in seen_in_second]
    only_in_second = [line for line in lines2 if line not in seen_in_first]

    return {
        "identical": text1 == text2,
        "file1_lines": len(lines1),
        "file2_lines": len(lines2),
        "file1_words": len(text1.split()),
        "file2_words": len(text2.split()),
        "only_in_file1": only_in_first[:20],  # Limit output
        "only_in_file2": only_in_second[:20],
    }
@@ -0,0 +1,390 @@
1
+ Metadata-Version: 2.4
2
+ Name: smartdocloader
3
+ Version: 1.0.0
4
+ Summary: An advanced universal document loader - load TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX with smart auto-detection
5
+ Author: Your Name
6
+ Author-email: your.email@example.com
7
+ License: MIT
8
+ Project-URL: Source, https://github.com/yourusername/smartdocloader
9
+ Keywords: document,loader,pdf,docx,xlsx,csv,json,yaml,parser,reader
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.7
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Topic :: Text Processing
22
+ Requires-Python: >=3.7
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE.txt
25
+ Requires-Dist: PyPDF2
26
+ Requires-Dist: python-docx
27
+ Requires-Dist: python-pptx
28
+ Requires-Dist: openpyxl
29
+ Provides-Extra: yaml
30
+ Requires-Dist: pyyaml; extra == "yaml"
31
+ Provides-Extra: all
32
+ Requires-Dist: pyyaml; extra == "all"
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: keywords
39
+ Dynamic: license
40
+ Dynamic: license-file
41
+ Dynamic: project-url
42
+ Dynamic: provides-extra
43
+ Dynamic: requires-dist
44
+ Dynamic: requires-python
45
+ Dynamic: summary
46
+
47
+ # smartdocloader
48
+
49
+ An advanced universal document loader for Python. Smart auto-detection, batch loading, content search, and support for 11+ file formats.
50
+
51
+ ## Installation
52
+
53
+ ```bash
54
+ pip install smartdocloader
55
+ ```
56
+
57
+ For YAML support:
58
+ ```bash
59
+ pip install smartdocloader[yaml]
60
+ ```
61
+
62
+ ## Features
63
+
64
+ - **Auto-detection**: Automatically detects file type and uses the right loader
65
+ - **Batch loading**: Load multiple files in one call
66
+ - **11 formats supported**: TXT, CSV, JSON, XML, YAML, INI, HTML, PDF, DOCX, PPTX, XLSX
67
+ - **Content search**: Search within loaded data
68
+ - **Export**: Convert loaded data to JSON
69
+ - **File comparison**: Compare content of two files
70
+ - **Advanced options**: Page ranges for PDFs, sheet selection for Excel, table extraction from Word
71
+
72
+ ---
73
+
74
+ ## Quick Start
75
+
76
+ ```python
77
+ from smartdocloader import auto_load
78
+
79
+ # Just pass any file - it auto-detects the format
80
+ data = auto_load("report.pdf")
81
+ data = auto_load("data.csv")
82
+ data = auto_load("config.yaml")
83
+ ```
84
+
85
+ ---
86
+
87
+ ## All Supported Formats
88
+
89
+ | Format | Extensions | Module |
90
+ |--------|-----------|--------|
91
+ | Plain Text | `.txt` | text_loader |
92
+ | CSV | `.csv` | text_loader |
93
+ | JSON | `.json` | text_loader |
94
+ | XML | `.xml` | text_loader |
95
+ | YAML | `.yaml`, `.yml` | text_loader |
96
+ | INI/Config | `.ini`, `.cfg` | text_loader |
97
+ | HTML | `.html`, `.htm` | text_loader |
98
+ | PDF | `.pdf` | doc_loader |
99
+ | Word | `.docx` | doc_loader |
100
+ | PowerPoint | `.pptx` | doc_loader |
101
+ | Excel | `.xlsx` | doc_loader |
102
+
103
+ ---
104
+
105
+ ## Modules & Functions
106
+
107
+ ### Module 1: `text_loader`
108
+
109
+ | Function | Description |
110
+ |----------|-------------|
111
+ | `load_txt(filepath, encoding)` | Load plain text file |
112
+ | `load_csv(filepath, delimiter, encoding)` | Load CSV as list of dicts |
113
+ | `load_json(filepath, encoding)` | Load and parse JSON |
114
+ | `load_xml(filepath)` | Load XML as nested dict |
115
+ | `load_yaml(filepath, encoding)` | Load YAML data |
116
+ | `load_ini(filepath, encoding)` | Load INI as nested dict |
117
+ | `load_html(filepath, encoding)` | Extract text from HTML |
118
+
119
+ ### Module 2: `doc_loader`
120
+
121
+ | Function | Description |
122
+ |----------|-------------|
123
+ | `load_pdf(filepath, page_range)` | Extract text from PDF (optional page range) |
124
+ | `load_pdf_pages(filepath)` | Get text per page as a list |
125
+ | `load_pdf_metadata(filepath)` | Get PDF metadata (author, title, etc.) |
126
+ | `load_docx(filepath, include_tables)` | Load Word document paragraphs |
127
+ | `load_docx_with_styles(filepath)` | Load with style/formatting info |
128
+ | `load_pptx(filepath, include_notes)` | Load PowerPoint slides |
129
+ | `load_xlsx(filepath, sheet_name)` | Load Excel data from specific sheet |
130
+ | `load_xlsx_sheets(filepath)` | List all sheet names |
131
+
132
+ ### Module 3: `smart_loader`
133
+
134
+ | Function | Description |
135
+ |----------|-------------|
136
+ | `auto_load(filepath)` | Auto-detect format and load |
137
+ | `batch_load(filepaths)` | Load multiple files at once |
138
+ | `get_file_info(filepath)` | Get file metadata (size, type, etc.) |
139
+ | `supported_formats()` | List all supported extensions |
140
+
141
+ ### Module 4: `utils`
142
+
143
+ | Function | Description |
144
+ |----------|-------------|
145
+ | `search_content(data, keyword, case_sensitive)` | Search within loaded content (case-insensitive by default) |
146
+ | `convert_to_text(data)` | Convert any loaded data to plain text |
147
+ | `word_count(data)` | Count words in loaded content |
148
+ | `export_to_json(data, output_path, indent)` | Export loaded data to JSON file |
149
+ | `compare_files(filepath1, filepath2)` | Compare two files |
150
+
151
+ ---
152
+
153
+ ## Usage Examples
154
+
155
+ ### Auto-Loading (Smart Detection)
156
+
157
+ ```python
158
+ from smartdocloader import auto_load
159
+
160
+ # Just pass any file path - format is auto-detected
161
+ pdf_content = auto_load("report.pdf")
162
+ csv_data = auto_load("students.csv")
163
+ config = auto_load("settings.yaml")
164
+
165
+ print(pdf_content[:100])
166
+ ```
167
+
168
+ ### Batch Loading Multiple Files
169
+
170
+ ```python
171
+ from smartdocloader import batch_load
172
+
173
+ files = ["data.csv", "report.pdf", "config.json", "notes.txt"]
174
+ results = batch_load(files)
175
+
176
+ for filepath, content in results.items():
177
+ if isinstance(content, dict) and "error" in content:
178
+ print(f"Failed: {filepath} - {content['error']}")
179
+ else:
180
+ print(f"Loaded: {filepath}")
181
+ ```
182
+
183
+ ### Loading PDFs with Options
184
+
185
+ ```python
186
+ from smartdocloader import load_pdf, load_pdf_pages, load_pdf_metadata
187
+
188
+ # Load entire PDF
189
+ full_text = load_pdf("book.pdf")
190
+
191
+ # Load only pages 0-4 (first 5 pages)
192
+ intro = load_pdf("book.pdf", page_range=(0, 5))
193
+
194
+ # Get text per page
195
+ pages = load_pdf_pages("book.pdf")
196
+ print(f"Page 1: {pages[0][:100]}")
197
+ print(f"Total pages: {len(pages)}")
198
+
199
+ # Get metadata
200
+ meta = load_pdf_metadata("book.pdf")
201
+ print(f"Author: {meta['author']}")
202
+ print(f"Title: {meta['title']}")
203
+ print(f"Pages: {meta['page_count']}")
204
+ ```
205
+
206
+ ### Loading Word Documents
207
+
208
+ ```python
209
+ from smartdocloader import load_docx, load_docx_with_styles
210
+
211
+ # Basic loading
212
+ paragraphs = load_docx("report.docx")
213
+ for p in paragraphs:
214
+ print(p)
215
+
216
+ # With tables included
217
+ content = load_docx("report.docx", include_tables=True)
218
+ for item in content:
219
+ print(item)
220
+
221
+ # With style information
222
+ styled = load_docx_with_styles("report.docx")
223
+ for para in styled:
224
+ if para["bold"]:
225
+ print(f"[BOLD] {para['text']}")
226
+ else:
227
+ print(f" {para['text']}")
228
+ ```
229
+
230
+ ### Loading Excel with Sheet Selection
231
+
232
+ ```python
233
+ from smartdocloader import load_xlsx, load_xlsx_sheets
234
+
235
+ # See available sheets
236
+ sheets = load_xlsx_sheets("financials.xlsx")
237
+ print(f"Sheets: {sheets}")
238
+
239
+ # Load specific sheet
240
+ q1_data = load_xlsx("financials.xlsx", sheet_name="Q1")
241
+ for row in q1_data:
242
+ print(row)
243
+ ```
244
+
245
+ ### Loading PowerPoint with Notes
246
+
247
+ ```python
248
+ from smartdocloader import load_pptx
249
+
250
+ slides = load_pptx("lecture.pptx", include_notes=True)
251
+ for slide in slides:
252
+ print(f"--- Slide {slide['slide_number']} ---")
253
+ for text in slide["text"]:
254
+ print(f" {text}")
255
+ if "notes" in slide and slide["notes"]:
256
+ print(f" [Notes: {slide['notes']}]")
257
+ ```
258
+
259
+ ### Loading YAML Configuration
260
+
261
+ ```python
262
+ from smartdocloader import load_yaml
263
+
264
+ config = load_yaml("docker-compose.yml")
265
+ print(config["services"])
266
+ ```
267
+
268
+ ### Loading INI/Config Files
269
+
270
+ ```python
271
+ from smartdocloader import load_ini
272
+
273
+ settings = load_ini("app.ini")
274
+ print(settings["database"]["host"])
275
+ print(settings["database"]["port"])
276
+ ```
277
+
278
+ ### Loading HTML (Text Extraction)
279
+
280
+ ```python
281
+ from smartdocloader import load_html
282
+
283
+ text = load_html("page.html")
284
+ print(text) # Clean text without HTML tags
285
+ ```
286
+
287
+ ### Searching Within Loaded Content
288
+
289
+ ```python
290
+ from smartdocloader import auto_load, search_content
291
+
292
+ # Load any file
293
+ data = auto_load("students.csv")
294
+
295
+ # Search for a keyword
296
+ matches = search_content(data, "Ahmed")
297
+ print(f"Found {len(matches)} matches:")
298
+ for match in matches:
299
+ print(f" {match}")
300
+ ```
301
+
302
+ ### Converting to Plain Text
303
+
304
+ ```python
305
+ from smartdocloader import auto_load, convert_to_text
306
+
307
+ # Load structured data
308
+ data = auto_load("grades.xlsx")
309
+
310
+ # Convert to flat text
311
+ text = convert_to_text(data)
312
+ print(text)
313
+ ```
314
+
315
+ ### Exporting to JSON
316
+
317
+ ```python
318
+ from smartdocloader import auto_load, export_to_json
319
+
320
+ # Load a Word document
321
+ data = auto_load("report.docx")
322
+
323
+ # Export as JSON for further processing
324
+ export_to_json(data, "report_output.json")
325
+ ```
326
+
327
+ ### Comparing Two Files
328
+
329
+ ```python
330
+ from smartdocloader import compare_files
331
+
332
+ result = compare_files("version1.txt", "version2.txt")
333
+ print(f"Identical: {result['identical']}")
334
+ print(f"File 1: {result['file1_lines']} lines, {result['file1_words']} words")
335
+ print(f"File 2: {result['file2_lines']} lines, {result['file2_words']} words")
336
+ ```
337
+
338
+ ### Getting File Info
339
+
340
+ ```python
341
+ from smartdocloader import get_file_info
342
+
343
+ info = get_file_info("report.pdf")
344
+ print(f"Name: {info['name']}")
345
+ print(f"Size: {info['size_readable']}")
346
+ print(f"Supported: {info['is_supported']}")
347
+ ```
348
+
349
+ ### Listing Supported Formats
350
+
351
+ ```python
352
+ from smartdocloader import supported_formats
353
+
354
+ formats = supported_formats()
355
+ print(f"Supported: {', '.join(formats)}")
356
+ ```
357
+
358
+ ---
359
+
360
+ ## Error Handling
361
+
362
+ ```python
363
+ from smartdocloader import auto_load
364
+
365
+ try:
366
+ data = auto_load("unknown.xyz")
367
+ except ValueError as e:
368
+ print(f"Format error: {e}")
369
+ except FileNotFoundError as e:
370
+ print(f"File missing: {e}")
371
+ except Exception as e:
372
+ print(f"Error: {e}")
373
+ ```
374
+
375
+ ---
376
+
377
+ ## Requirements
378
+
379
+ - Python >= 3.7
380
+ - PyPDF2
381
+ - python-docx
382
+ - python-pptx
383
+ - openpyxl
384
+ - pyyaml (optional, for YAML support)
385
+
386
+ ---
387
+
388
+ ## License
389
+
390
+ MIT
@@ -0,0 +1,10 @@
1
+ smartdocloader/__init__.py,sha256=F7QQOV_yB811Inqxoepbe0aLSDCCQ7GUmthr4ONlPj4,1265
2
+ smartdocloader/doc_loader.py,sha256=YwWnDnx8i0ZtXifKdVXjj5ZO51vbih4oUqOjINqEqII,5603
3
+ smartdocloader/smart_loader.py,sha256=33854mZ14Z6rIo928FxNzJ-kjUteAeDH9f9lQQeYyOU,3263
4
+ smartdocloader/text_loader.py,sha256=3svOQn6rnabJOw995zneH9fmzejJIhwYf92aGN6ogFM,4572
5
+ smartdocloader/utils.py,sha256=c6g7jIscWUgKMTqGt_ORZrBTA5lvPXLWtcHofNjrlhw,5060
6
+ smartdocloader-1.0.0.dist-info/licenses/LICENSE.txt,sha256=u8nMEhpntc7IP_2mYqSBf-ma-DlZ96r0xQyTAd3G5T8,1062
7
+ smartdocloader-1.0.0.dist-info/METADATA,sha256=6y5EDLuQ1umRIpHBGzmoSqLdgzR9_QlqgPggXOjpYis,10180
8
+ smartdocloader-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
+ smartdocloader-1.0.0.dist-info/top_level.txt,sha256=H6Vf0XUxub-LDjLTJdtsclakQ6II551OwamRyW-51_c,15
10
+ smartdocloader-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2026
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ smartdocloader