content-extraction 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import tempfile
5
+ import mimetypes
6
+ import logging
7
+ from urllib.parse import urlparse
8
+
9
+ import requests
10
+
11
+ from content_extraction.extract_from_pptx import extract_content as extract_pptx_content
12
+ from content_extraction.semantic_chunk_html import HTMLSectionParser
13
+ from content_extraction.common_std_io import write_stream_of_obj
14
+ from content_extraction.split_and_create_digest import process_node
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class FileHandlerError(Exception):
    """Custom exception for file handling errors.

    Raised by the handlers in this module when a conversion, download,
    or external-tool invocation fails.
    """
22
+
23
+
24
def _convert_with_pandoc(file_path: str, output_dir: str):
    """Convert ``file_path`` to a standalone HTML document via pandoc.

    Writes ``index.html`` inside ``output_dir`` and returns its path.
    Raises FileHandlerError if pandoc is missing or the conversion fails.
    """
    target = os.path.join(output_dir, 'index.html')
    command = ['pandoc', file_path, '-s', '-o', target]
    try:
        subprocess.run(
            command,
            encoding='utf-8',
            text=True,
            capture_output=True,
            check=True,
        )
    except FileNotFoundError:
        error_msg = 'Error: `pandoc` command not found. Please ensure pandoc is installed and in your PATH.'
        logger.error(error_msg)
        raise FileHandlerError(error_msg)
    except subprocess.CalledProcessError as e:
        logger.error(f'Error converting {file_path} to HTML: {e.stderr}')
        raise FileHandlerError(f'Pandoc conversion failed for {file_path}') from e
    return target
43
+
44
+
45
def process_pdf(file_path: str, output_dir: str):
    """
    Handles PDF files by running the main processing script.

    The script is expected to convert the PDF to HTML and place it as
    index.html in the output_dir.

    Args:
        file_path: Path to the input PDF.
        output_dir: Directory where the script writes its output.

    Returns:
        Path to the generated index.html.

    Raises:
        FileNotFoundError: If the processing script itself is missing.
        FileHandlerError: If permissions cannot be set, the script fails,
            or it does not produce the expected output file.
    """
    logger.info(f'[Processing PDF file] started for: "{file_path}"')
    # This path assumes the script is located at src/scripts/process_document.sh
    script_path = os.path.join(os.path.dirname(__file__), '..', 'scripts', 'process_document.sh')
    output_html_path = os.path.join(output_dir, 'index.html')
    logger.debug(f'[Processing PDF file] script path: "{script_path}"; output_html_path: "{output_html_path}"')

    if not os.path.exists(script_path):
        raise FileNotFoundError(f'Processing script not found at: {script_path}')

    # Ensure the script is executable
    if not os.access(script_path, os.X_OK):
        logger.warning(f'Script {script_path} is not executable. Attempting to set permissions.')
        try:
            os.chmod(script_path, 0o755)
        except OSError as e:
            # Fix: chain the original OSError so the root cause is preserved.
            raise FileHandlerError(f'Failed to set executable permissions for {script_path}: {e}') from e

    try:
        # The script is expected to take input_file and output_directory as arguments
        subprocess.run(
            [script_path, file_path, output_dir],
            check=True,  # Raise CalledProcessError if the command returns a non-zero exit code
            capture_output=True,  # Capture stdout and stderr
            text=True,  # Decode stdout/stderr as text
            encoding='utf-8',
        )
        if not os.path.exists(output_html_path):
            raise FileHandlerError(
                f'Processing script {script_path} completed, but did not produce the expected output file: {output_html_path}'
            )
    except subprocess.CalledProcessError as e:
        logger.error(f'Error processing PDF with script: {e.stderr}')
        raise FileHandlerError(f'PDF processing script failed for {file_path}') from e

    logger.info(f'[Processing PDF file] completed for: "{file_path}"')
    return output_html_path
87
+
88
+
89
def process_pptx(file_path: str, output_dir: str):
    """Handle PowerPoint files via the existing pptx extraction function.

    The extracted HTML is normalized to ``output_dir/index.html``.
    """
    logger.info(f'[Processing PPTX file] started for: "{file_path}"')
    extracted = extract_pptx_content(file_path, output_dir)
    if not extracted:
        raise FileHandlerError(f'Failed to extract content from {file_path}')

    # Normalize the output filename to index.html.
    target = os.path.join(output_dir, 'index.html')
    already_in_place = os.path.abspath(extracted) == os.path.abspath(target)
    if not already_in_place:
        shutil.move(extracted, target)
    logger.info(f'[Processing PPTX file] completed for: "{file_path}"')
    return target
104
+
105
+
106
def process_docx(file_path: str, output_dir: str):
    """Handles Word documents by converting them to HTML using pandoc."""
    logger.info(f'[Processing DOCX file] started for: "{file_path}"')
    converted = _convert_with_pandoc(file_path, output_dir)
    logger.info(f'[Processing DOCX file] completed for: "{file_path}"')
    return converted
112
+
113
+
114
def process_markdown(file_path: str, output_dir: str):
    """Handles Markdown files by converting them to HTML using pandoc."""
    logger.info(f'[Processing Markdown file] started for: "{file_path}"')
    converted = _convert_with_pandoc(file_path, output_dir)
    logger.info(f'[Processing Markdown file] completed for: "{file_path}"')
    return converted
120
+
121
+
122
def process_html(file_path: str, output_dir: str):
    """
    Handles HTML files by copying them to the output directory with the standard name.

    Fix: the previous implementation used shutil.move, which destroyed the
    caller's input file even though the docstring promised a copy. copy2
    preserves the source file (and its metadata) while producing index.html.
    """
    logger.info(f'[Processing HTML file] started for: "{file_path}"')
    dest_path = os.path.join(output_dir, 'index.html')
    if os.path.abspath(file_path) != os.path.abspath(dest_path):
        shutil.copy2(file_path, dest_path)
    logger.info(f'[Processing HTML file] completed for: "{file_path}"')
    return dest_path
132
+
133
+
134
def handle_url(url: str, output_dir: str, force_ext: str = ''):
    """
    Handles a URL by determining the file type and using the most efficient
    processing method.

    HTML responses are streamed directly into output_dir/index.html; every
    other supported type is downloaded to a temporary file and dispatched to
    the matching extension handler.

    Fixes over the previous version:
    - the NamedTemporaryFile(delete=False) download was never removed, leaking
      a temp file on both the success and failure paths; it is now cleaned up
      in a ``finally`` block (guarded, since some handlers move the file away);
    - the HTML-branch error now chains the original exception (``from e``).

    Raises:
        FileHandlerError: On any network failure or missing handler.
    """
    logger.info(f'[Processing URL] started for: "{url}"')
    file_ext = None

    if force_ext:
        file_ext = f'.{force_ext.lstrip(".")}'
    else:
        try:
            response = requests.head(url, timeout=15, allow_redirects=True)
            response.raise_for_status()
            content_type = response.headers.get('content-type')
            if content_type:
                mime_type = content_type.split(';')[0].strip()
                file_ext = mimetypes.guess_extension(mime_type)

            # Fall back to the URL path's extension when the MIME type is
            # missing or too generic to be useful.
            if not file_ext or file_ext in ['.bin']:
                parsed_url = urlparse(url)
                _, ext_from_url = os.path.splitext(parsed_url.path)
                if ext_from_url:
                    file_ext = ext_from_url

        except requests.RequestException as e:
            raise FileHandlerError(f'Failed to retrieve headers from URL {url}: {e}') from e

    if not file_ext or file_ext.lower() not in EXTENSION_HANDLERS:
        logger.warning(f'Could not determine a supported file type for {url}. Defaulting to HTML.')
        file_ext = '.html'

    # HTML is streamed straight to disk; other types go through a temp file.
    if file_ext == '.html':
        output_html_path = os.path.join(output_dir, 'index.html')
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(output_html_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return output_html_path
        except requests.RequestException as e:
            raise FileHandlerError(f'Failed to download HTML content from {url}: {e}') from e

    handler_func = EXTENSION_HANDLERS.get(file_ext.lower())
    if not handler_func:
        raise FileHandlerError(f"No handler found for file type '{file_ext}' from URL {url}")

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as temp_file:
            temp_file_path = temp_file.name
            with requests.get(url, stream=True, timeout=120) as r:
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=8192):
                    temp_file.write(chunk)
        result = handler_func(temp_file_path, output_dir)
        logger.info(f'[Processing URL] completed for: "{url}"')
        return result
    except requests.RequestException as e:
        raise FileHandlerError(f'Failed to download content from {url}: {e}') from e
    finally:
        # Some handlers move the temp file into output_dir, so check first.
        if temp_file_path and os.path.exists(temp_file_path):
            os.unlink(temp_file_path)
197
+
198
+
199
# Mapping of lowercase file extensions to handler functions.
# Defined after the handlers so all names are bound at module load time;
# handle_url and get_handler perform their lookups here.
EXTENSION_HANDLERS = {
    '.pdf': process_pdf,
    '.pptx': process_pptx,
    '.docx': process_docx,
    '.md': process_markdown,
    '.html': process_html,
}
207
+
208
+
209
def get_handler(input_path: str, force_ext: str = ''):
    """
    Determines and returns the correct file handler function based on the input.

    Returns a callable that accepts an output directory. URLs are dispatched
    to handle_url; local paths go to the extension-specific handler.

    Raises:
        FileNotFoundError: If a local input path does not exist.
        ValueError: If the extension is missing or unsupported.
    """
    # URLs are resolved lazily by handle_url at call time.
    if input_path.startswith(('http://', 'https://')):
        return lambda output_dir: handle_url(input_path, output_dir, force_ext)

    if not os.path.exists(input_path):
        raise FileNotFoundError(f'Input file not found: {input_path}')

    ext = os.path.splitext(input_path)[1]
    file_ext = f'.{force_ext.lstrip(".")}' if force_ext else ext

    if not file_ext:
        raise ValueError('File has no extension, and --force-ext was not provided.')

    handler_func = EXTENSION_HANDLERS.get(file_ext.lower())
    if handler_func is None:
        logger.error(f'Unsupported file type: {file_ext}')
        raise ValueError(f'Unsupported file type: {file_ext}')

    return lambda output_dir: handler_func(input_path, output_dir)
232
+
233
+
234
def process_file(input_path: str, output_dir: str, force_ext: str = '') -> str:
    """
    Main entry point for processing a file or URL.

    Identifies the input type, runs the matching handler, chunks the resulting
    HTML into sections, writes a JSONL digest alongside it, and returns the
    path to the final processed HTML file.
    """
    os.makedirs(output_dir, exist_ok=True)
    logger.info(f'[Processing File] Retrieving correct parser for "{input_path}"')
    run_handler = get_handler(input_path, force_ext)
    try:
        final_html_path = run_handler(output_dir)
    except FileHandlerError as e:
        logger.error(
            f'[Processing File] Processing failed to produce an output file for "{input_path}"',
            extra={'error': str(e)},
        )
        raise

    if not final_html_path or not os.path.exists(final_html_path):
        raise FileHandlerError(f"[Processing File] Processing failed to produce an output file for '{input_path}'")

    logger.info(f'[Processing File] Reading generated HTML file in "{final_html_path}"')
    try:
        with open(final_html_path, 'r', encoding='utf-8') as html_file:
            html_content = html_file.read()
    except Exception as e:
        logger.error(
            f'[Processing File] Failed to read the generated HTML file at {final_html_path}',
            extra={'error': str(e)},
        )
        raise

    logger.info('[Processing File] Parsing HTML into sections.')
    parsed_sections = HTMLSectionParser().parse_sections(html_content)

    logger.info('[Processing File] Splitting parsed sections and creating JSON digest.')
    jsonl_output_path = os.path.join(output_dir, 'sections.jsonl')

    # Flatten every parsed section into digest nodes.
    all_nodes = [
        node
        for section in (parsed_sections or [])
        for node in process_node(section, parent_digest_hash=None)
    ]

    write_stream_of_obj(all_nodes, jsonl_output_path)
    logger.info(f'[Processing File] Successfully created JSON digest at {jsonl_output_path}')

    return final_html_path
@@ -0,0 +1,245 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Markdown File Formatting Automation Script
4
+
5
+ This script automates two specific formatting fixes in a Markdown file:
6
+ 1. Adjusting heading levels based on numerical hierarchy.
7
+ 2. Formatting the REFERENCES section with consistent spacing.
8
+
9
+ It operates as a command-line tool, reading from a file or stdin and
10
+ writing to a file or stdout, in a standard UNIX-like fashion.
11
+ """
12
+
13
+ import re
14
+ import argparse
15
+ import difflib
16
+ from typing import Iterable
17
+ import sys
18
+ import logging
19
+
20
+ from content_extraction.common_std_io import write_output
21
+ from .logging_config import setup_logging
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def adjust_headings(lines):
    """
    Normalize heading levels to match their numeric hierarchy and separate
    inline paragraphs from heading titles.

    Args:
        lines: Iterable of input text lines.

    Yields:
        str: Processed lines, with blank lines inserted before headings and
        between a heading and any paragraph that was inlined after its title.
    """
    HEADING_PATTERN = re.compile(r'^(#*)\s+([A-Z]\s?(?:\.\s*\d+)*|\d+(?:\.\s*\d+)*)\s*(.*)$')
    ORDERED_LIST_PATTERN = re.compile(r'^\s*\d+\.\s')

    for raw in lines:
        # Numbered list items pass through untouched.
        if ORDERED_LIST_PATTERN.match(raw):
            yield raw
            continue

        hit = HEADING_PATTERN.match(raw)
        if hit is None:
            candidate = raw.lstrip()
            if candidate.startswith('#'):
                # Headings without a recognizable number default to level 4.
                yield ''
                yield '#### ' + candidate.lstrip('#').strip()
            else:
                # Non-heading lines are preserved verbatim.
                yield raw
            continue

        _, number, remainder = hit.groups()
        # One hash per numeric component, plus one (e.g. "2.1" -> "###").
        level = len(number.split('.')) + 1

        # Split an inline paragraph off the heading title, if present.
        title = remainder.strip()
        trailing_paragraph = None
        if '.' in title:
            head, tail = title.split('.', 1)
            tail = tail.strip()
            # Text after the period starting with a letter is a paragraph.
            if tail and tail[0].isalpha():
                title = head.strip() + '.'
                trailing_paragraph = tail

        # Blank line before the heading for spacing, then the heading itself.
        yield ''
        yield '#' * level + ' ' + number + ' ' + title

        if trailing_paragraph:
            yield ''
            yield trailing_paragraph
95
+
96
+
97
def format_references(lines):
    """
    Format the REFERENCES section with consistent spacing.

    Inside the section, blank lines are dropped and each entry is stripped
    and followed by exactly one blank line; everything before the section
    header is passed through unchanged.

    Args:
        lines: List of lines from the input file.

    Returns:
        list: Modified lines with properly formatted references.
    """
    out = []
    in_refs = False

    for line in lines:
        if line.strip() == '# REFERENCES':
            # Section header: start reformatting from here on.
            in_refs = True
            out.append(line)
        elif in_refs:
            entry = line.strip()
            if entry:
                # Each reference entry gets exactly one trailing blank line.
                out.extend([entry, ''])
        else:
            out.append(line)

    return out
128
+
129
+
130
def process_science_paper(text_file_content: str, heading_file_content: str):
    """
    Apply both formatting fixes (heading adjustment, references spacing)
    to science-paper markdown.

    Note: heading_file_content is accepted for signature parity with
    process_general_paper but is not used by this pipeline.

    Returns:
        str: The reformatted document with a trailing newline.
    """
    source_lines = text_file_content.splitlines()
    with_headings = list(adjust_headings(source_lines))
    final_lines = format_references(with_headings)
    return '\n'.join(final_lines) + '\n'
144
+
145
+
146
def parse_ndiff(diff_lines: Iterable[str]) -> list[tuple[str, str]]:
    """
    Turn an ndiff iterable into a list of (old_line, new_line) patches.

    A "- old" line immediately answered by a "+ new" line becomes a
    replacement patch. A "- old" line closed by a context/blank line becomes
    a deletion patch (old, ''). "? " guide lines are ignored, so the common
    ndiff shape "- old / ? hints / + new" still pairs correctly.

    Fix: context/blank lines previously appended a spurious ('', '') patch
    even when no removal was pending, polluting the patch list.
    """
    patches: list[tuple[str, str]] = []
    pending_old = None

    for line in diff_lines:
        if line.startswith('- '):
            pending_old = line[2:]
        elif line.startswith('+ ') and pending_old is not None:
            patches.append((pending_old, line[2:]))
            pending_old = None
        elif line.startswith(' ') or not line:
            # A context or blank line closes any pending removal as a deletion.
            if pending_old is not None:
                patches.append((pending_old, ''))
            pending_old = None
        # '? ' guide lines fall through and keep pending_old alive.

    return patches
166
+
167
+
168
def apply_heading_patches(ocr_text: str, diff_lines: Iterable[str]) -> str:
    """
    Apply heading corrections from an ndiff iterable to the OCR text.

    For each (old, new) patch, the first line of the OCR text that exactly
    equals old is replaced with new; remaining lines are left untouched.
    """
    corrections = parse_ndiff(diff_lines)
    text_lines = ocr_text.splitlines()

    for old_heading, new_heading in corrections:
        for position, current in enumerate(text_lines):
            if current == old_heading:
                text_lines[position] = new_heading
                break  # only the first exact match is patched

    return '\n'.join(text_lines)
185
+
186
+
187
def process_general_paper(text_file_content: str, heading_file_content: str) -> str:
    """Correct heading levels in OCR text using an LLM-backed corrector.

    Runs CorrectHeadingLevel over the headings file, diffs the corrected
    headings against the originals, and applies the resulting patches to
    the OCR text via apply_heading_patches.
    """
    # Imported lazily: dspy_modules presumably pulls in heavy model
    # dependencies not needed by the science-paper path.
    from content_extraction.dspy_modules import CorrectHeadingLevel

    heading_corrector = CorrectHeadingLevel()
    pred = heading_corrector(heading_file_content)
    corrected_headings = pred.corrected_headings
    # NOTE(review): writes a debug artifact into the current working
    # directory and relies on the platform default encoding — confirm
    # this side effect is intended in production runs.
    with open('corrected_headings.txt', 'w') as f:
        f.write(corrected_headings)
    diff = difflib.ndiff(heading_file_content.splitlines(), corrected_headings.splitlines())
    fixed_text = apply_heading_patches(text_file_content, diff)
    return fixed_text
198
+
199
+
200
def main():
    """Handle command line arguments and run the formatting pipeline.

    Reads the OCR markdown and headings files, applies either the
    science-paper or general-paper pipeline, and writes the result to the
    output file or stdout. Returns 0 on success (propagated as the process
    exit code by the __main__ guard).

    Fix: both open() calls now pass encoding='utf-8' so behavior no longer
    depends on the platform's default locale encoding.
    """
    setup_logging()
    parser = argparse.ArgumentParser(
        description='Automate markdown file formatting fixes.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.md              # Parse file, output to stdout
  %(prog)s -o output.md input.md # Parse file, save to file
  cat input.md | %(prog)s        # Parse from stdin
  %(prog)s --verbose input.md    # Show debug information
        """,
    )
    parser.add_argument('ocr_input_file', help='Path to input markdown file')
    parser.add_argument('headings_input_file', help='Path to markdown file with headings')
    parser.add_argument(
        '-o',
        '--output',
        help='Path to output markdown file (if not provided, writes to stdout)',
    )
    parser.add_argument(
        '--science_paper',
        action='store_true',
        help='Indicates that the input is a science paper. Parsing optimized for scientific papers.',
    )
    args = parser.parse_args()

    # Read both inputs with an explicit encoding (see docstring).
    with open(args.ocr_input_file, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    with open(args.headings_input_file, 'r', encoding='utf-8') as f:
        headings_content = f.read()

    # Process the markdown content with the selected pipeline.
    if args.science_paper:
        processed_content = process_science_paper(markdown_content, headings_content)
    else:
        processed_content = process_general_paper(markdown_content, headings_content)

    # Write output to file or stdout
    write_output(processed_content, args.output)
    return 0
242
+
243
+
244
if __name__ == '__main__':
    # Delegate to main(); its return value becomes the process exit code.
    sys.exit(main())
@@ -0,0 +1,13 @@
1
+ import logging
2
+ import sys
3
+
4
+
5
def setup_logging(level=logging.INFO):
    """
    Configure basic application-wide logging on the root logger.

    Args:
        level: Logging threshold; defaults to logging.INFO.
    """
    # basicConfig is a no-op if the root logger already has handlers,
    # so repeated calls are harmless.
    logging.basicConfig(
        stream=sys.stdout,  # Log to stdout
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ HTML Content Extraction CLI
4
+
5
+ A command-line tool for extracting structured content from HTML documents.
6
+ Converts HTML sections into hierarchical JSON data with preserved formatting.
7
+
8
+ Usage:
9
+ python main.py [options] [input_file]
10
+
11
+ Examples:
12
+ # Read from stdin, output to stdout
13
+ cat example.html | python main.py
14
+
15
+ # Read from file, output to stdout
16
+ python main.py input.html
17
+
18
+ # Read from stdin, output to file
19
+ python main.py -o output.json
20
+
21
+ # Read from file, output to file
22
+ python main.py input.html -o output.json
23
+
24
+ # Pretty print JSON output
25
+ python main.py --pretty input.html
26
+
27
+ # Verbose mode with debug information
28
+ python main.py --verbose input.html
29
+ """
30
+
31
+ import sys
32
+ import argparse
33
+ import json
34
+ import logging
35
+
36
+ from content_extraction.common_std_io import read_input, write_output
37
+ from content_extraction.semantic_chunk_html import HTMLSectionParser
38
+ from .logging_config import setup_logging
39
+
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
def main():
    """Main CLI entry point.

    Parses arguments, reads HTML from a file or stdin, extracts structured
    sections, and writes them out as JSON. Returns a process exit code
    (0 on success, 1 on interruption or error).

    Fixes:
    - the --pretty flag was accepted but never used; json.dumps now honors it;
    - the argparse parser is no longer shadowed by the HTML parser variable;
    - success now returns an explicit 0.
    """
    arg_parser = argparse.ArgumentParser(
        description='Extract structured content from HTML documents',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s input.html             # Parse file, output to stdout
  %(prog)s -o output.json input.html  # Parse file, save to JSON
  cat input.html | %(prog)s       # Parse from stdin
  %(prog)s --pretty input.html    # Pretty-printed JSON output
  %(prog)s --verbose input.html   # Show debug information
        """,
    )

    arg_parser.add_argument(
        'input_file',
        nargs='?',
        help='Input HTML file (if not provided, reads from stdin)',
    )

    arg_parser.add_argument(
        '-o',
        '--output',
        metavar='FILE',
        help='Output JSON file (if not provided, writes to stdout)',
    )

    arg_parser.add_argument(
        '--pretty',
        action='store_true',
        help='Pretty-print JSON output with indentation',
    )

    arg_parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='Show verbose output and debug information',
    )

    arg_parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')

    args = arg_parser.parse_args()
    setup_logging(level=logging.DEBUG if args.verbose else logging.INFO)

    try:
        # Read input
        if args.input_file:
            logger.debug(f'Reading from file: {args.input_file}')
        else:
            logger.debug('Reading from stdin...')

        html_content = read_input(args.input_file)

        # Parse HTML
        section_parser = HTMLSectionParser()
        result = section_parser.parse_sections(html_content)

        # Write output, honoring --pretty (previously ignored).
        write_output(json.dumps(result, indent=2 if args.pretty else None), args.output)

        logger.debug('Processing completed successfully')
        return 0

    except KeyboardInterrupt:
        logger.warning('Operation cancelled by user')
        return 1
    except Exception:
        logger.error('An unexpected error occurred', exc_info=True)
        return 1
114
+
115
+
116
if __name__ == '__main__':
    # Delegate to main(); its return value becomes the process exit code.
    sys.exit(main())