decant-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
decant/__init__.py ADDED
File without changes
decant/cli/__init__.py ADDED
File without changes
decant/cli/main.py ADDED
@@ -0,0 +1,158 @@
1
+ """
2
+ Decant CLI entry point.
3
+
4
+ Provides convert command for HTML to readable HTML conversion.
5
+ """
6
+ import argparse
7
+ import sys
8
+
9
+ from decant.core.content_selector import detect_mode
10
+ from decant.core.parser import parse, extract_with_trafilatura, ValidationError, harvest_captions
11
+ from decant.core.renderer import render
12
+ from decant.io.reader import read_html
13
+ from decant.io.writer import write_html
14
+
15
+
16
def _default_output_path(input_path):
    """Return the default output path derived from *input_path*.

    A trailing ``.html`` suffix is swapped for ``.decant.html``; any other
    path simply gets ``.decant.html`` appended. Only the final suffix is
    rewritten, so an ``.html`` that appears earlier in the path (for example
    in a directory name) is left untouched.
    """
    html_suffix = '.html'
    if input_path.endswith(html_suffix):
        return input_path[:-len(html_suffix)] + '.decant.html'
    return input_path + '.decant.html'


def main():
    """
    Main CLI entry point.

    Parses the command line, reads the input HTML, selects a processing
    mode (explicit or auto-detected), optionally runs Trafilatura
    extraction, parses the result into a document model, renders it and
    writes the output. Always terminates the process via ``sys.exit``.

    Exit codes:
    - 0: Success
    - 1: Validation/parse error (ValidationError or ValueError)
    - 2: Render error or any other unexpected exception
         (argparse also exits with 2 on command-line usage errors)
    - 3: I/O error
    """
    parser = argparse.ArgumentParser(
        prog='decant',
        description='Convert semantic HTML to accessible readable HTML'
    )

    parser.add_argument(
        'input',
        nargs='?',
        help='Input HTML file (omit for stdin)'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file path (default: <input>.decant.html or stdout)'
    )

    parser.add_argument(
        '--font',
        choices=['opendyslexic'],
        help='Use alternative font (opendyslexic)'
    )

    parser.add_argument(
        '--mode',
        choices=['transform', 'extract', 'auto'],
        default='auto',
        help=(
            'Processing mode (default: auto). '
            'transform: fidelity-first, no boilerplate removal. '
            'extract: boilerplate removal via Trafilatura, formatting best-effort. '
            'auto: detect mode from input (defaults to transform when ambiguous).'
        )
    )

    parser.add_argument(
        '--source-url',
        default='',
        help='URL of the original source page (used for "View original" links)'
    )

    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Print processing decisions to stderr'
    )

    args = parser.parse_args()

    # Determine input/output paths
    input_path = args.input
    output_path = args.output

    # If no output was specified and we are reading from a file, derive the
    # output name from the input name. With no input file either, output_path
    # stays unset and is handed to write_html as-is (presumably meaning
    # stdout, per the --output help text; confirm against decant.io.writer).
    if not output_path and input_path:
        output_path = _default_output_path(input_path)

    # Font flag
    use_opendyslexic = args.font == 'opendyslexic'

    try:
        # Read input
        html_input = read_html(input_path)

        # Determine mode
        if args.mode == 'auto':
            mode = detect_mode(html_input)
            if args.verbose:
                print(f"Mode: {mode} (auto-detected)", file=sys.stderr)
        else:
            mode = args.mode
            if args.verbose:
                print(f"Mode: {mode} (explicit)", file=sys.stderr)

        # Route to appropriate pipeline
        original_title = None
        caption_map = None
        if mode == 'extract':
            # Capture title before Trafilatura strips <head>.
            # Imported here so bs4 is only loaded when extract mode runs.
            from bs4 import BeautifulSoup
            original_soup = BeautifulSoup(html_input, "lxml")
            # find() yields the <title> element or None; it is forwarded to
            # parse() unchanged.
            original_title = original_soup.find("title")

            # Harvest captions before Trafilatura strips figcaptions
            caption_map = harvest_captions(html_input)

            if args.verbose:
                print("Extract mode: running Trafilatura boilerplate removal", file=sys.stderr)

            html_input = extract_with_trafilatura(html_input)

            if args.verbose:
                print("Extract mode: boilerplate removal complete", file=sys.stderr)
        else:
            if args.verbose:
                print("Transform mode: fidelity-first parsing, no boilerplate removal", file=sys.stderr)

        # Parse to model
        document = parse(
            html_input,
            original_title=original_title,
            require_article_body=(mode == "extract"),
            caption_map=caption_map,
            source_url=args.source_url,
        )

        # Render to readable HTML
        html_output = render(document, use_opendyslexic=use_opendyslexic)

        # Write output
        write_html(html_output, output_path)

        # Success. SystemExit is not an Exception subclass, so the handlers
        # below do not intercept it.
        sys.exit(0)

    except IOError as e:
        print(f"I/O error: {e}", file=sys.stderr)
        sys.exit(3)
    except ValidationError as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)
    except ValueError as e:
        print(f"Validation error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Catch-all boundary: anything not classified above exits with 2.
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(2)
155
+
156
+
157
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == '__main__':
    main()
File without changes