content-extraction 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ """
2
+ HTML Section Parser - Extracts hierarchical sections from HTML content
3
+ """
4
+
5
+ from bs4 import BeautifulSoup
6
+ from bs4.element import Tag
7
+
8
+
9
class HTMLSectionParser:
    """Fast parser for HTML that finds sections and splits content into subsections."""

    def __init__(self):
        # Standard HTML heading tag names.
        self.heading_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def get_heading_level(self, element) -> int | None:
        """Return the heading level of *element*, or None if it is not a heading.

        Resolution order:
        1. A numeric ``aria-level`` attribute always wins, so standard headings
           can be overridden (e.g. ``<h3 aria-level="1">`` is treated as level 1).
        2. Standard ``h1``-``h6`` tags use the digit from the tag name.
        3. ``role="heading"`` without an ``aria-level`` defaults to level 1.
        """
        # BUGFIX: check aria-level before the tag name so that it also
        # overrides h1-h6 headings, as documented in the package README
        # ("Aria Level Overrides"). Previously the tag name won.
        aria_level = element.get("aria-level")
        if aria_level and aria_level.isdigit():
            return int(aria_level)

        if element.name in self.heading_tags:
            return int(element.name[1])

        if element.get("role") == "heading":
            # Default to level 1 when no aria-level is specified.
            return 1

        return None

    def extract_text_between_headings(
        self, soup, start_element, end_element=None
    ) -> str:
        """Extract all content between two heading elements.

        Walks the siblings after *start_element*, collecting serialized HTML
        until *end_element*, another heading, or a container that itself holds
        headings is reached. *soup* is unused but kept for interface
        compatibility.
        """
        content_parts = []
        current = start_element.next_sibling

        while current and current != end_element:
            if isinstance(current, Tag):
                # A new heading starts the next section: stop here.
                if self.get_heading_level(current) is not None:
                    break
                # A container with headings inside is processed as
                # subsections, not as body text of this section.
                if self._find_headings_in_element(current):
                    break
                content_parts.append(str(current))
            elif (
                hasattr(current, "string") and current.string and current.string.strip()
            ):
                # Bare text node between tags.
                content_parts.append(current.string)
            current = current.next_sibling

        return "".join(content_parts).strip()

    def _find_headings_in_element(self, element):
        """Return (element, level) pairs for every heading nested in *element*."""
        return [
            (child, level)
            for child in element.find_all()
            if (level := self.get_heading_level(child)) is not None
        ]

    def find_next_heading_at_level_or_higher(
        self, soup, start_element, current_level: int
    ):
        """Return the next sibling heading at *current_level* or more important
        (numerically smaller) level, or None if there is none."""
        current = start_element.next_sibling

        while current:
            if isinstance(current, Tag):
                level = self.get_heading_level(current)
                if level is not None and level <= current_level:
                    return current
            current = current.next_sibling

        return None

    def parse_sections(self, html_content: str) -> list[dict[str, object]]:
        """Parse HTML and extract hierarchical sections.

        Returns a list of section dicts with keys ``title``, ``text``,
        ``level`` and ``subsections``; empty list when no headings are found.
        """
        soup = BeautifulSoup(html_content, "lxml")

        # All heading-like elements, in document order.
        headings = [
            (element, level)
            for element in soup.find_all()
            if (level := self.get_heading_level(element)) is not None
        ]

        if not headings:
            return []

        return self._build_hierarchy(soup, headings)

    def _build_hierarchy(self, soup, headings: list[tuple]) -> list[dict[str, object]]:
        """Build the nested section structure from (element, level) pairs."""
        if not headings:
            return []

        result = []
        i = 0

        while i < len(headings):
            current_element, current_level = headings[i]

            # Advance j past every heading that is deeper than the current
            # one; those become subsections of this section.
            j = i + 1
            while j < len(headings) and headings[j][1] > current_level:
                j += 1
            subsection_headings = headings[i + 1 : j]

            # The section body runs until the next heading at the same or a
            # more important level.
            next_boundary = headings[j][0] if j < len(headings) else None
            text_content = self.extract_text_between_headings(
                soup, current_element, next_boundary
            )

            result.append(
                {
                    "title": current_element.get_text().strip(),
                    "text": text_content,
                    "level": current_level,
                    "subsections": self._build_hierarchy(soup, subsection_headings),
                }
            )

            # Continue with the next section at the same or higher level.
            i = j

        return result
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+ import sys
3
+ import argparse
4
+ import hashlib
5
+ import json
6
+ import logging
7
+ from content_extraction.common_std_io import read_input, write_stream_of_obj
8
+ from dataclasses import dataclass, field, asdict
9
+
10
+ from .logging_config import setup_logging
11
+
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
@dataclass
class Node:
    """One section node as produced by the HTML section parser.

    Documents the expected input schema: {'title', 'text', 'level',
    'subsections'}. NOTE(review): this class is not referenced by the
    processing functions below, which operate on plain dicts — it appears to
    exist as schema documentation; confirm before removing.
    """

    title: str  # heading text of the section
    text: str   # HTML body of the section (headings excluded)
    level: int  # heading level (1-6, or a custom aria-level value)
    # Child sections. The default is an empty list, so None only occurs if a
    # caller passes it explicitly.
    subsections: list['Node'] | None = field(default_factory=list)
22
+
23
+
24
@dataclass
class SectionDigestNode:
    """Summary of a section: its title/text plus shortened immediate children."""

    title: str  # section heading text
    text: str   # full text for the node itself; shortened text for children
    # One level of children only; populated by generate_section_digest.
    subsections: list['SectionDigestNode'] = field(default_factory=list)
29
+
30
+
31
@dataclass
class ProcessResultNode:
    """One flattened output row: a node with its digest linkage."""

    digest_hash: str                 # BLAKE2b hex digest identifying this node
    parent_digest_hash: str | None   # parent's digest; None for top-level nodes
    title: str                       # section title
    text: str                        # section body text
    section_digest: SectionDigestNode  # digest of this node and immediate children
38
+
39
+
40
+ def shorten_text(text: str, max_elements: int = 2, subsections: list[dict] | None = None) -> str:
41
+ """Shorten text by splitting on lines and keeping at most max_elements, appending '...' if truncated."""
42
+ if max_elements == -1:
43
+ return text
44
+
45
+ if not text:
46
+ result = ''
47
+ for child in subsections or []:
48
+ result = '<p>Covered topics in this subsection:</p><ul>'
49
+ for child in subsections or []:
50
+ result += f'<li>{child.get("title")}</li>'
51
+ result += '</ul>'
52
+ return result
53
+
54
+ DELIM = ''
55
+ lines = text.splitlines()
56
+ if len(lines) <= max_elements:
57
+ if subsections:
58
+ lines.append('...')
59
+ return DELIM.join(lines)
60
+ shortened = lines[:max_elements]
61
+ shortened.append('...')
62
+ return DELIM.join(shortened)
63
+
64
+
65
def generate_section_digest(node: dict) -> SectionDigestNode:
    """Generate a section digest string for a node, including its title/text and immediate children."""
    own_text = node.get('text', '')
    digest = SectionDigestNode(title=node.get('title', ''), text=own_text)

    # Children keep one line of text when this node has its own text,
    # otherwise their full text (-1 disables shortening).
    child_budget = 1 if own_text else -1
    for child in node.get('subsections') or []:
        trimmed = shorten_text(
            child.get('text'), child_budget, child.get('subsections')
        )
        digest.subsections.append(
            SectionDigestNode(title=child.get('title'), text=trimmed)
        )
    return digest
77
+
78
+
79
def compute_digest_hash(section_digest: SectionDigestNode) -> str:
    """Compute a BLAKE2b hash of the section digest text as the node ID."""
    # The dataclass repr is the canonical serialization being hashed.
    serialized = str(section_digest).encode('utf-8')
    return hashlib.blake2b(serialized, digest_size=16).hexdigest()
85
+
86
+
87
def process_node(node: dict, parent_digest_hash: str | None = None) -> list[dict]:
    """
    Recursively process a node and its subsections, returning a flat list of nodes.
    """
    digest = generate_section_digest(node)
    node_id = compute_digest_hash(digest)

    # Plain keyword construction; asdict() recursively converts the nested
    # SectionDigestNode dataclasses into dicts for serialization.
    row = asdict(
        ProcessResultNode(
            digest_hash=node_id,
            parent_digest_hash=parent_digest_hash,
            title=node.get('title'),
            text=node.get('text'),
            section_digest=digest,
        )
    )

    flattened = [row]
    for child in node.get('subsections') or []:
        flattened.extend(process_node(child, parent_digest_hash=node_id))
    return flattened
107
+
108
+
109
def main():
    """CLI entry point: read hierarchical JSON, emit flattened JSONL rows."""
    parser = argparse.ArgumentParser(
        description=('Split hierarchical JSON into JSON Lines with node summaries and parent digests.')
    )
    parser.add_argument('input', nargs='?', help='Input JSON file (defaults to stdin)')
    parser.add_argument('-o', '--output', help='Output JSONL file (defaults to stdout)')
    args = parser.parse_args()

    setup_logging()

    logger.info(f'Processing input from {args.input or "stdin"}')
    data_list = json.loads(read_input(args.input))
    logger.info(f'Found {len(data_list)} top-level sections to process.')

    # Flatten every top-level section into one stream of rows.
    nodes: list[dict] = []
    for data in data_list:
        nodes.extend(process_node(data, parent_digest_hash=None))
    write_stream_of_obj(nodes, args.output)
    logger.info(f'Successfully processed and wrote {len(nodes)} nodes to {args.output or "stdout"}.')


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,258 @@
1
+ Metadata-Version: 2.4
2
+ Name: content_extraction
3
+ Version: 0.1.0
4
+ Summary: Project dedicated to content extraction from unstructured files that contain some useful information.
5
+ Requires-Python: >=3.13
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: beautifulsoup4>=4.13.4
8
+ Requires-Dist: lxml>=6.0.0
9
+ Requires-Dist: python-pptx>=1.0.2
10
+
11
+ # HTML Content Extraction Tool
12
+
13
+ A powerful command-line tool for extracting structured content from HTML documents. Converts HTML sections into hierarchical JSON data while preserving formatting, links, and semantic structure.
14
+
15
+ ## Features
16
+
17
+ - **Hierarchical Parsing**: Automatically detects heading levels and creates nested section structures
18
+ - **HTML Preservation**: Maintains original formatting, links, and semantic elements
19
+ - **Smart Element Filtering**: Includes meaningful content while filtering out irrelevant elements
20
+ - **Flexible Input/Output**: Read from files or stdin, output to files or stdout
21
+ - **Section Support**: Works with existing `<section>`, `<article>`, and `<main>` elements
22
+ - **Custom Headings**: Supports both standard headings (`h1`-`h6`) and custom headings with `aria-level`
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ # Install dependencies
28
+ pip install beautifulsoup4 lxml
29
+
30
+ # Clone or download this repository
31
+ git clone <repository-url>
32
+ cd content-extraction
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ### Basic Usage
38
+
39
+ ```bash
40
+ # Parse HTML file and output to stdout
41
+ python main.py example.html
42
+
43
+ # Parse with pretty-printed JSON
44
+ python main.py --pretty example.html
45
+
46
+ # Save output to file
47
+ python main.py example.html -o output.json
48
+
49
+ # Read from stdin
50
+ cat example.html | python main.py --pretty
51
+
52
+ # Verbose mode with debug information
53
+ python main.py --verbose example.html
54
+ ```
55
+
56
+ ### Command Line Options
57
+
58
+ ```
59
+ usage: main.py [-h] [-o FILE] [--pretty] [-v] [--version] [input_file]
60
+
61
+ Extract structured content from HTML documents
62
+
63
+ positional arguments:
64
+ input_file Input HTML file (if not provided, reads from stdin)
65
+
66
+ options:
67
+ -h, --help show this help message and exit
68
+ -o, --output FILE Output JSON file (if not provided, writes to stdout)
69
+ --pretty Pretty-print JSON output with indentation
70
+ -v, --verbose Show verbose output and debug information
71
+ --version show program's version number and exit
72
+ ```
73
+
74
+ ## Output Format
75
+
76
+ The tool outputs JSON with the following structure:
77
+
78
+ ```json
79
+ {
80
+ "title": "Section Title",
81
+ "text": "<p>HTML content preserved</p>",
82
+ "level": 1,
83
+ "subsections": [
84
+ {
85
+ "title": "Subsection Title",
86
+ "text": "<p>Subsection content</p>",
87
+ "level": 2,
88
+ "subsections": []
89
+ }
90
+ ]
91
+ }
92
+ ```
93
+
94
+ ### Fields
95
+
96
+ - **`title`**: Text content of the highest-level heading in the section
97
+ - **`text`**: All content except headings, with HTML formatting preserved
98
+ - **`level`**: Heading level of the main heading (1-6 from the tag name, or a custom `aria-level` value)
99
+ - **`subsections`**: Array of nested subsections with the same structure
100
+
101
+ ## Examples
102
+
103
+ ### Simple Section
104
+
105
+ **Input HTML:**
106
+ ```html
107
+ <section>
108
+ <h2>Getting Started</h2>
109
+ <p>Welcome to our <a href="/api">API</a>!</p>
110
+ <ul>
111
+ <li>Step 1: Register</li>
112
+ <li>Step 2: Get API key</li>
113
+ </ul>
114
+ </section>
115
+ ```
116
+
117
+ **Output:**
118
+ ```json
119
+ {
120
+ "title": "Getting Started",
121
+ "text": "<p>Welcome to our <a href=\"/api\">API</a>!</p>\n<ul>\n<li>Step 1: Register</li>\n<li>Step 2: Get API key</li>\n</ul>",
122
+ "level": 2,
123
+ "subsections": []
124
+ }
125
+ ```
126
+
127
+ ### Nested Sections
128
+
129
+ **Input HTML:**
130
+ ```html
131
+ <main>
132
+ <h1>Documentation</h1>
133
+ <p>Introduction text.</p>
134
+ <h2>Installation</h2>
135
+ <p>Installation instructions.</p>
136
+ <h3>Requirements</h3>
137
+ <p>System requirements.</p>
138
+ <h2>Usage</h2>
139
+ <p>Usage examples.</p>
140
+ </main>
141
+ ```
142
+
143
+ **Output:**
144
+ ```json
145
+ {
146
+ "title": "Documentation",
147
+ "text": "<p>Introduction text.</p>",
148
+ "level": 1,
149
+ "subsections": [
150
+ {
151
+ "title": "Installation",
152
+ "text": "<p>Installation instructions.</p>",
153
+ "level": 2,
154
+ "subsections": [
155
+ {
156
+ "title": "Requirements",
157
+ "text": "<p>System requirements.</p>",
158
+ "level": 3,
159
+ "subsections": []
160
+ }
161
+ ]
162
+ },
163
+ {
164
+ "title": "Usage",
165
+ "text": "<p>Usage examples.</p>",
166
+ "level": 2,
167
+ "subsections": []
168
+ }
169
+ ]
170
+ }
171
+ ```
172
+
173
+ ## Supported HTML Elements
174
+
175
+ ### Included Elements
176
+ - Paragraphs (`<p>`)
177
+ - Lists (`<ul>`, `<ol>`, `<li>`)
178
+ - Links (`<a>`)
179
+ - Formatting (`<strong>`, `<em>`, `<code>`, etc.)
180
+ - Semantic elements (`<section>`, `<article>`, `<aside>`, etc.)
181
+ - Tables (`<table>`, `<tr>`, `<td>`, etc.)
182
+ - Media (`<img>`, `<figure>`)
183
+ - Code blocks (`<pre>`, `<code>`)
184
+ - Quotes (`<blockquote>`, `<q>`)
185
+ - All other content elements with meaningful text
186
+
187
+ ### Excluded Elements
188
+ - Headings (processed separately as section titles)
189
+ - Script and style tags
190
+ - Meta elements
191
+ - Empty elements
192
+ - Elements containing headings (processed as subsections)
193
+
194
+ ## Smart Root Element Detection
195
+
196
+ The tool automatically detects the best root element in this priority order:
197
+
198
+ 1. `<main>` - Primary content area
199
+ 2. `<article>` - Standalone article content
200
+ 3. `<section>` - Document section
201
+ 4. `<body>` - Document body
202
+ 5. First substantial `<div>` - Fallback for div-based layouts
203
+ 6. Entire document - Last resort
204
+
205
+ ## Advanced Features
206
+
207
+ ### Custom Headings
208
+ Supports custom headings with ARIA attributes:
209
+
210
+ ```html
211
+ <div role="heading" aria-level="2">Custom Heading</div>
212
+ ```
213
+
214
+ ### Aria Level Overrides
215
+ Standard headings can have their levels overridden:
216
+
217
+ ```html
218
+ <h3 aria-level="1">This is treated as level 1</h3>
219
+ ```
220
+
221
+ ### Mixed Content
222
+ Handles complex layouts with mixed content types:
223
+
224
+ ```html
225
+ <div>
226
+ <h1>Main Title</h1>
227
+ <p>Introduction</p>
228
+ <section>
229
+ <h2>Section in Section</h2>
230
+ <p>Section content</p>
231
+ </section>
232
+ <h2>Regular Heading</h2>
233
+ <p>Regular content</p>
234
+ </div>
235
+ ```
236
+
237
+ ## Testing
238
+
239
+ Run the test suite:
240
+
241
+ ```bash
242
+ python -m pytest tests/ -v
243
+ ```
244
+
245
+ The project includes comprehensive tests covering:
246
+ - Basic parsing functionality
247
+ - Heading level detection
248
+ - Content extraction
249
+ - Section handling
250
+ - Edge cases and error conditions
251
+
252
+ ## License
253
+
254
+ This project is open source. See LICENSE file for details.
255
+
256
+ ## Contributing
257
+
258
+ Contributions are welcome! Please submit pull requests with tests for any new features.
@@ -0,0 +1,15 @@
1
+ content_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ content_extraction/common_std_io.py,sha256=mSRaiI4OrnttEQ8Y92-LsJnAHEI3xLKnJvmXDHmkfWc,1547
3
+ content_extraction/do_ocr.py,sha256=lrqwPYQlPuUHabirH_RzKbzHgYUPPpNeHDe_u4h9LEY,6886
4
+ content_extraction/dspy_modules.py,sha256=0aAokJQNzczfowoUNK3BPMi_U18eXM9thHvciWaE5b0,732
5
+ content_extraction/extract_from_pptx.py,sha256=IWd81sn7ZsyaQZdXP5Cgbk7GspcDYEjMnBkti-pTHQY,6572
6
+ content_extraction/file_handlers.py,sha256=mO4HWiA_ZEKkV8KZP4fOz_nGnxDpghkqAhS0ADG9Oqk,11149
7
+ content_extraction/fix_ocr.py,sha256=2xJ4c3VsGSy1l-qAukvhaV8QOp6yu5BY99Gb0DwamWQ,8009
8
+ content_extraction/logging_config.py,sha256=GN1wuJJEspQ3z-FZIg134obsHweuiicZfz2an13a9_I,296
9
+ content_extraction/parse_html.py,sha256=mOrZKXX59YcdWWhmbnoTnfXpwrg0znk38x0DMJIVes8,3137
10
+ content_extraction/semantic_chunk_html.py,sha256=iJPspKkrt95lL46JpC_9fgT8GfV8cz04TWEnU99rbBw,5786
11
+ content_extraction/split_and_create_digest.py,sha256=bKZL9Axc74zLH_VrlNjd46ZiVTQQrAY5iNJCotO-8v8,4253
12
+ content_extraction-0.1.0.dist-info/METADATA,sha256=3dQRIhF8zxiifsp3Fxpo8BCKqvV9N3xtjyCAkNlwQ_I,6201
13
+ content_extraction-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
+ content_extraction-0.1.0.dist-info/top_level.txt,sha256=a0I0EwSzsyd3p_aAENozn9i4I3aBn12XtrbqIvfzZec,19
15
+ content_extraction-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ content_extraction