content-extraction 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. content_extraction-0.1.0/PKG-INFO +258 -0
  2. content_extraction-0.1.0/README.md +248 -0
  3. content_extraction-0.1.0/pyproject.toml +49 -0
  4. content_extraction-0.1.0/setup.cfg +4 -0
  5. content_extraction-0.1.0/src/content_extraction/__init__.py +0 -0
  6. content_extraction-0.1.0/src/content_extraction/common_std_io.py +50 -0
  7. content_extraction-0.1.0/src/content_extraction/do_ocr.py +199 -0
  8. content_extraction-0.1.0/src/content_extraction/dspy_modules.py +24 -0
  9. content_extraction-0.1.0/src/content_extraction/extract_from_pptx.py +174 -0
  10. content_extraction-0.1.0/src/content_extraction/file_handlers.py +280 -0
  11. content_extraction-0.1.0/src/content_extraction/fix_ocr.py +245 -0
  12. content_extraction-0.1.0/src/content_extraction/logging_config.py +13 -0
  13. content_extraction-0.1.0/src/content_extraction/parse_html.py +117 -0
  14. content_extraction-0.1.0/src/content_extraction/semantic_chunk_html.py +164 -0
  15. content_extraction-0.1.0/src/content_extraction/split_and_create_digest.py +134 -0
  16. content_extraction-0.1.0/src/content_extraction.egg-info/PKG-INFO +258 -0
  17. content_extraction-0.1.0/src/content_extraction.egg-info/SOURCES.txt +20 -0
  18. content_extraction-0.1.0/src/content_extraction.egg-info/dependency_links.txt +1 -0
  19. content_extraction-0.1.0/src/content_extraction.egg-info/requires.txt +3 -0
  20. content_extraction-0.1.0/src/content_extraction.egg-info/top_level.txt +2 -0
  21. content_extraction-0.1.0/tests/test_section_parser.py +35 -0
  22. content_extraction-0.1.0/tests/test_semantic_chunk_html.py +304 -0
@@ -0,0 +1,258 @@
1
+ Metadata-Version: 2.4
2
+ Name: content_extraction
3
+ Version: 0.1.0
4
+ Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: beautifulsoup4>=4.13.4
8
+ Requires-Dist: lxml>=6.0.0
9
+ Requires-Dist: python-pptx>=1.0.2
10
+
11
+ # HTML Content Extraction Tool
12
+
13
+ A powerful command-line tool for extracting structured content from HTML documents. Converts HTML sections into hierarchical JSON data while preserving formatting, links, and semantic structure.
14
+
15
+ ## Features
16
+
17
+ - **Hierarchical Parsing**: Automatically detects heading levels and creates nested section structures
18
+ - **HTML Preservation**: Maintains original formatting, links, and semantic elements
19
+ - **Smart Element Filtering**: Includes meaningful content while filtering out irrelevant elements
20
+ - **Flexible Input/Output**: Read from files or stdin, output to files or stdout
21
+ - **Section Support**: Works with existing `<section>`, `<article>`, and `<main>` elements
22
+ - **Custom Headings**: Supports both standard headings (`h1`-`h6`) and custom headings with `aria-level`
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ # Install dependencies
28
+ pip install beautifulsoup4
29
+
30
+ # Clone or download this repository
31
+ git clone <repository-url>
32
+ cd content-extraction
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Basic Usage
38
+
39
+ ```bash
40
+ # Parse HTML file and output to stdout
41
+ python main.py example.html
42
+
43
+ # Parse with pretty-printed JSON
44
+ python main.py --pretty example.html
45
+
46
+ # Save output to file
47
+ python main.py example.html -o output.json
48
+
49
+ # Read from stdin
50
+ cat example.html | python main.py --pretty
51
+
52
+ # Verbose mode with debug information
53
+ python main.py --verbose example.html
54
+ ```
55
+
56
+ ### Command Line Options
57
+
58
+ ```
59
+ usage: main.py [-h] [-o FILE] [--pretty] [-v] [--version] [input_file]
60
+
61
+ Extract structured content from HTML documents
62
+
63
+ positional arguments:
64
+ input_file Input HTML file (if not provided, reads from stdin)
65
+
66
+ options:
67
+ -h, --help show this help message and exit
68
+ -o, --output FILE Output JSON file (if not provided, writes to stdout)
69
+ --pretty Pretty-print JSON output with indentation
70
+ -v, --verbose Show verbose output and debug information
71
+ --version show program's version number and exit
72
+ ```
73
+
74
+ ## Output Format
75
+
76
+ The tool outputs JSON with the following structure:
77
+
78
+ ```json
79
+ {
80
+ "title": "Section Title",
81
+ "text": "<p>HTML content preserved</p>",
82
+ "level": 1,
83
+ "subsections": [
84
+ {
85
+ "title": "Subsection Title",
86
+ "text": "<p>Subsection content</p>",
87
+ "level": 2,
88
+ "subsections": []
89
+ }
90
+ ]
91
+ }
92
+ ```
93
+
94
+ ### Fields
95
+
96
+ - **`title`**: Text content of the highest-level heading in the section
97
+ - **`text`**: All content except headings, with HTML formatting preserved
98
+ - **`level`**: ARIA level of the main heading (1-6, or custom levels)
99
+ - **`subsections`**: Array of nested subsections with the same structure
100
+
101
+ ## Examples
102
+
103
+ ### Simple Section
104
+
105
+ **Input HTML:**
106
+ ```html
107
+ <section>
108
+ <h2>Getting Started</h2>
109
+ <p>Welcome to our <a href="/api">API</a>!</p>
110
+ <ul>
111
+ <li>Step 1: Register</li>
112
+ <li>Step 2: Get API key</li>
113
+ </ul>
114
+ </section>
115
+ ```
116
+
117
+ **Output:**
118
+ ```json
119
+ {
120
+ "title": "Getting Started",
121
+ "text": "<p>Welcome to our <a href=\"/api\">API</a>!</p>\n<ul>\n<li>Step 1: Register</li>\n<li>Step 2: Get API key</li>\n</ul>",
122
+ "level": 2,
123
+ "subsections": []
124
+ }
125
+ ```
126
+
127
+ ### Nested Sections
128
+
129
+ **Input HTML:**
130
+ ```html
131
+ <main>
132
+ <h1>Documentation</h1>
133
+ <p>Introduction text.</p>
134
+ <h2>Installation</h2>
135
+ <p>Installation instructions.</p>
136
+ <h3>Requirements</h3>
137
+ <p>System requirements.</p>
138
+ <h2>Usage</h2>
139
+ <p>Usage examples.</p>
140
+ </main>
141
+ ```
142
+
143
+ **Output:**
144
+ ```json
145
+ {
146
+ "title": "Documentation",
147
+ "text": "<p>Introduction text.</p>",
148
+ "level": 1,
149
+ "subsections": [
150
+ {
151
+ "title": "Installation",
152
+ "text": "<p>Installation instructions.</p>",
153
+ "level": 2,
154
+ "subsections": [
155
+ {
156
+ "title": "Requirements",
157
+ "text": "<p>System requirements.</p>",
158
+ "level": 3,
159
+ "subsections": []
160
+ }
161
+ ]
162
+ },
163
+ {
164
+ "title": "Usage",
165
+ "text": "<p>Usage examples.</p>",
166
+ "level": 2,
167
+ "subsections": []
168
+ }
169
+ ]
170
+ }
171
+ ```
172
+
173
+ ## Supported HTML Elements
174
+
175
+ ### Included Elements
176
+ - Paragraphs (`<p>`)
177
+ - Lists (`<ul>`, `<ol>`, `<li>`)
178
+ - Links (`<a>`)
179
+ - Formatting (`<strong>`, `<em>`, `<code>`, etc.)
180
+ - Semantic elements (`<section>`, `<article>`, `<aside>`, etc.)
181
+ - Tables (`<table>`, `<tr>`, `<td>`, etc.)
182
+ - Media (`<img>`, `<figure>`)
183
+ - Code blocks (`<pre>`, `<code>`)
184
+ - Quotes (`<blockquote>`, `<q>`)
185
+ - All other content elements with meaningful text
186
+
187
+ ### Excluded Elements
188
+ - Headings (processed separately as section titles)
189
+ - Script and style tags
190
+ - Meta elements
191
+ - Empty elements
192
+ - Elements containing headings (processed as subsections)
193
+
194
+ ## Smart Root Element Detection
195
+
196
+ The tool automatically detects the best root element in this priority order:
197
+
198
+ 1. `<main>` - Primary content area
199
+ 2. `<article>` - Standalone article content
200
+ 3. `<section>` - Document section
201
+ 4. `<body>` - Document body
202
+ 5. First substantial `<div>` - Fallback for div-based layouts
203
+ 6. Entire document - Last resort
204
+
205
+ ## Advanced Features
206
+
207
+ ### Custom Headings
208
+ Supports custom headings with ARIA attributes:
209
+
210
+ ```html
211
+ <div role="heading" aria-level="2">Custom Heading</div>
212
+ ```
213
+
214
+ ### Aria Level Overrides
215
+ Standard headings can have their levels overridden:
216
+
217
+ ```html
218
+ <h3 aria-level="1">This is treated as level 1</h3>
219
+ ```
220
+
221
+ ### Mixed Content
222
+ Handles complex layouts with mixed content types:
223
+
224
+ ```html
225
+ <div>
226
+ <h1>Main Title</h1>
227
+ <p>Introduction</p>
228
+ <section>
229
+ <h2>Section in Section</h2>
230
+ <p>Section content</p>
231
+ </section>
232
+ <h2>Regular Heading</h2>
233
+ <p>Regular content</p>
234
+ </div>
235
+ ```
236
+
237
+ ## Testing
238
+
239
+ Run the test suite:
240
+
241
+ ```bash
242
+ python -m pytest tests/ -v
243
+ ```
244
+
245
+ The project includes comprehensive tests covering:
246
+ - Basic parsing functionality
247
+ - Heading level detection
248
+ - Content extraction
249
+ - Section handling
250
+ - Edge cases and error conditions
251
+
252
+ ## License
253
+
254
+ This project is open source. See the LICENSE file for details.
255
+
256
+ ## Contributing
257
+
258
+ Contributions are welcome! Please submit pull requests with tests for any new features.
@@ -0,0 +1,248 @@
1
+ # HTML Content Extraction Tool
2
+
3
+ A powerful command-line tool for extracting structured content from HTML documents. Converts HTML sections into hierarchical JSON data while preserving formatting, links, and semantic structure.
4
+
5
+ ## Features
6
+
7
+ - **Hierarchical Parsing**: Automatically detects heading levels and creates nested section structures
8
+ - **HTML Preservation**: Maintains original formatting, links, and semantic elements
9
+ - **Smart Element Filtering**: Includes meaningful content while filtering out irrelevant elements
10
+ - **Flexible Input/Output**: Read from files or stdin, output to files or stdout
11
+ - **Section Support**: Works with existing `<section>`, `<article>`, and `<main>` elements
12
+ - **Custom Headings**: Supports both standard headings (`h1`-`h6`) and custom headings with `aria-level`
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # Install dependencies
18
+ pip install beautifulsoup4
19
+
20
+ # Clone or download this repository
21
+ git clone <repository-url>
22
+ cd content-extraction
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ ### Basic Usage
28
+
29
+ ```bash
30
+ # Parse HTML file and output to stdout
31
+ python main.py example.html
32
+
33
+ # Parse with pretty-printed JSON
34
+ python main.py --pretty example.html
35
+
36
+ # Save output to file
37
+ python main.py example.html -o output.json
38
+
39
+ # Read from stdin
40
+ cat example.html | python main.py --pretty
41
+
42
+ # Verbose mode with debug information
43
+ python main.py --verbose example.html
44
+ ```
45
+
46
+ ### Command Line Options
47
+
48
+ ```
49
+ usage: main.py [-h] [-o FILE] [--pretty] [-v] [--version] [input_file]
50
+
51
+ Extract structured content from HTML documents
52
+
53
+ positional arguments:
54
+ input_file Input HTML file (if not provided, reads from stdin)
55
+
56
+ options:
57
+ -h, --help show this help message and exit
58
+ -o, --output FILE Output JSON file (if not provided, writes to stdout)
59
+ --pretty Pretty-print JSON output with indentation
60
+ -v, --verbose Show verbose output and debug information
61
+ --version show program's version number and exit
62
+ ```
63
+
64
+ ## Output Format
65
+
66
+ The tool outputs JSON with the following structure:
67
+
68
+ ```json
69
+ {
70
+ "title": "Section Title",
71
+ "text": "<p>HTML content preserved</p>",
72
+ "level": 1,
73
+ "subsections": [
74
+ {
75
+ "title": "Subsection Title",
76
+ "text": "<p>Subsection content</p>",
77
+ "level": 2,
78
+ "subsections": []
79
+ }
80
+ ]
81
+ }
82
+ ```
83
+
84
+ ### Fields
85
+
86
+ - **`title`**: Text content of the highest-level heading in the section
87
+ - **`text`**: All content except headings, with HTML formatting preserved
88
+ - **`level`**: ARIA level of the main heading (1-6, or custom levels)
89
+ - **`subsections`**: Array of nested subsections with the same structure
90
+
91
+ ## Examples
92
+
93
+ ### Simple Section
94
+
95
+ **Input HTML:**
96
+ ```html
97
+ <section>
98
+ <h2>Getting Started</h2>
99
+ <p>Welcome to our <a href="/api">API</a>!</p>
100
+ <ul>
101
+ <li>Step 1: Register</li>
102
+ <li>Step 2: Get API key</li>
103
+ </ul>
104
+ </section>
105
+ ```
106
+
107
+ **Output:**
108
+ ```json
109
+ {
110
+ "title": "Getting Started",
111
+ "text": "<p>Welcome to our <a href=\"/api\">API</a>!</p>\n<ul>\n<li>Step 1: Register</li>\n<li>Step 2: Get API key</li>\n</ul>",
112
+ "level": 2,
113
+ "subsections": []
114
+ }
115
+ ```
116
+
117
+ ### Nested Sections
118
+
119
+ **Input HTML:**
120
+ ```html
121
+ <main>
122
+ <h1>Documentation</h1>
123
+ <p>Introduction text.</p>
124
+ <h2>Installation</h2>
125
+ <p>Installation instructions.</p>
126
+ <h3>Requirements</h3>
127
+ <p>System requirements.</p>
128
+ <h2>Usage</h2>
129
+ <p>Usage examples.</p>
130
+ </main>
131
+ ```
132
+
133
+ **Output:**
134
+ ```json
135
+ {
136
+ "title": "Documentation",
137
+ "text": "<p>Introduction text.</p>",
138
+ "level": 1,
139
+ "subsections": [
140
+ {
141
+ "title": "Installation",
142
+ "text": "<p>Installation instructions.</p>",
143
+ "level": 2,
144
+ "subsections": [
145
+ {
146
+ "title": "Requirements",
147
+ "text": "<p>System requirements.</p>",
148
+ "level": 3,
149
+ "subsections": []
150
+ }
151
+ ]
152
+ },
153
+ {
154
+ "title": "Usage",
155
+ "text": "<p>Usage examples.</p>",
156
+ "level": 2,
157
+ "subsections": []
158
+ }
159
+ ]
160
+ }
161
+ ```
162
+
163
+ ## Supported HTML Elements
164
+
165
+ ### Included Elements
166
+ - Paragraphs (`<p>`)
167
+ - Lists (`<ul>`, `<ol>`, `<li>`)
168
+ - Links (`<a>`)
169
+ - Formatting (`<strong>`, `<em>`, `<code>`, etc.)
170
+ - Semantic elements (`<section>`, `<article>`, `<aside>`, etc.)
171
+ - Tables (`<table>`, `<tr>`, `<td>`, etc.)
172
+ - Media (`<img>`, `<figure>`)
173
+ - Code blocks (`<pre>`, `<code>`)
174
+ - Quotes (`<blockquote>`, `<q>`)
175
+ - All other content elements with meaningful text
176
+
177
+ ### Excluded Elements
178
+ - Headings (processed separately as section titles)
179
+ - Script and style tags
180
+ - Meta elements
181
+ - Empty elements
182
+ - Elements containing headings (processed as subsections)
183
+
184
+ ## Smart Root Element Detection
185
+
186
+ The tool automatically detects the best root element in this priority order:
187
+
188
+ 1. `<main>` - Primary content area
189
+ 2. `<article>` - Standalone article content
190
+ 3. `<section>` - Document section
191
+ 4. `<body>` - Document body
192
+ 5. First substantial `<div>` - Fallback for div-based layouts
193
+ 6. Entire document - Last resort
194
+
195
+ ## Advanced Features
196
+
197
+ ### Custom Headings
198
+ Supports custom headings with ARIA attributes:
199
+
200
+ ```html
201
+ <div role="heading" aria-level="2">Custom Heading</div>
202
+ ```
203
+
204
+ ### Aria Level Overrides
205
+ Standard headings can have their levels overridden:
206
+
207
+ ```html
208
+ <h3 aria-level="1">This is treated as level 1</h3>
209
+ ```
210
+
211
+ ### Mixed Content
212
+ Handles complex layouts with mixed content types:
213
+
214
+ ```html
215
+ <div>
216
+ <h1>Main Title</h1>
217
+ <p>Introduction</p>
218
+ <section>
219
+ <h2>Section in Section</h2>
220
+ <p>Section content</p>
221
+ </section>
222
+ <h2>Regular Heading</h2>
223
+ <p>Regular content</p>
224
+ </div>
225
+ ```
226
+
227
+ ## Testing
228
+
229
+ Run the test suite:
230
+
231
+ ```bash
232
+ python -m pytest tests/ -v
233
+ ```
234
+
235
+ The project includes comprehensive tests covering:
236
+ - Basic parsing functionality
237
+ - Heading level detection
238
+ - Content extraction
239
+ - Section handling
240
+ - Edge cases and error conditions
241
+
242
+ ## License
243
+
244
+ This project is open source. See the LICENSE file for details.
245
+
246
+ ## Contributing
247
+
248
+ Contributions are welcome! Please submit pull requests with tests for any new features.
@@ -0,0 +1,49 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools.packages.find]
6
+ where = ["src"]
7
+
8
+ [project]
9
+ name = "content_extraction"
10
+ version = "0.1.0"
11
+ description = "Project dedicated to content extraction from unstructured files that contain some useful information."
12
+ readme = "README.md"
13
+ requires-python = ">=3.13"
14
+ dependencies = ["beautifulsoup4>=4.13.4", "lxml>=6.0.0", "python-pptx>=1.0.2"]
15
+
16
+ [dependency-groups]
17
+ dev = [
18
+ "dspy>=2.6.27",
19
+ "jupyterlab>=4.4.5",
20
+ "pre-commit>=4.2.0",
21
+ "pyright>=1.1.403",
22
+ "pytest>=8.4.1",
23
+ "ruff>=0.12.4",
24
+ ]
25
+
26
+ [tool.pyright]
27
+ venvPath = "."
28
+ venv = ".venv"
29
+
30
+ [tool.ruff]
31
+ builtins = ["_"]
32
+ target-version = "py312"
33
+ extend-exclude = [
34
+ '.git',
35
+ '__pycache__',
36
+ 'build',
37
+ 'dist',
38
+ '.venv',
39
+ 'venv',
40
+ '.tox',
41
+ '.mypy_cache',
42
+ '.pytest_cache',
43
+ '*/migrations',
44
+ '*/dev-tools',
45
+ ]
46
+ line-length = 120
47
+
48
+ [tool.ruff.format]
49
+ quote-style = "single"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python3
2
+ import sys
3
+ import json
4
+ import logging
5
+ from typing import Iterable
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ def read_input(input_file: str | None = None) -> str:
11
+ """Read JSON content from a file or stdin and parse it."""
12
+ try:
13
+ if input_file:
14
+ with open(input_file, 'r', encoding='utf-8') as f:
15
+ content = f.read()
16
+ else:
17
+ content = sys.stdin.read()
18
+ except Exception as e:
19
+ logger.error(f'Error reading input from {input_file or "stdin"}', exc_info=True)
20
+ raise RuntimeError(f'Error reading input: {e}')
21
+
22
+ return content
23
+
24
+
25
+ def write_output(output: str, output_file: str | None = None):
26
+ try:
27
+ if output_file:
28
+ with open(output_file, 'w', encoding='utf-8') as f:
29
+ f.write(output)
30
+ else:
31
+ sys.stdout.write(output)
32
+ except IOError:
33
+ logger.error(f'Error writing to {output_file or "stdout"}', exc_info=True)
34
+ raise
35
+
36
+
37
+ def write_stream_of_obj(obj_stream: Iterable[dict], output_file: str | None = None):
38
+ try:
39
+ if output_file:
40
+ with open(output_file, 'w', encoding='utf-8') as f:
41
+ for obj in obj_stream:
42
+ f.write(json.dumps(obj))
43
+ f.write('\n')
44
+ else:
45
+ for obj in obj_stream:
46
+ sys.stdout.write(json.dumps(obj))
47
+ sys.stdout.write('\n')
48
+ except IOError:
49
+ logger.error(f'Error writing stream to {output_file or "stdout"}', exc_info=True)
50
+ raise