plainhtml 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. plainhtml-0.3.0/.github/workflows/main.yml +23 -0
  2. plainhtml-0.3.0/.gitignore +64 -0
  3. plainhtml-0.3.0/PKG-INFO +32 -0
  4. {plainhtml-0.2.2 → plainhtml-0.3.0}/README.md +7 -1
  5. plainhtml-0.3.0/pyproject.toml +23 -0
  6. plainhtml-0.3.0/tests/test_parser.py +107 -0
  7. plainhtml-0.3.0/tests/testcases/A Light in the Attic | Books to Scrape - Sandbox.html +361 -0
  8. plainhtml-0.3.0/tests/testcases/A Light in the Attic | Books to Scrape - Sandbox.txt +30 -0
  9. plainhtml-0.3.0/tests/testcases/IANA — IANA-managed Reserved Domains.html +233 -0
  10. plainhtml-0.3.0/tests/testcases/IANA — IANA-managed Reserved Domains.txt +105 -0
  11. plainhtml-0.3.0/tests/testcases/Scrapinghub Enterprise Solutions.html +3 -0
  12. plainhtml-0.3.0/tests/testcases/Scrapinghub Enterprise Solutions.txt +230 -0
  13. plainhtml-0.3.0/tests/testcases/Tutorial — Webstruct 0.6 documentation.html +590 -0
  14. plainhtml-0.3.0/tests/testcases/Tutorial — Webstruct 0.6 documentation.txt +214 -0
  15. plainhtml-0.3.0/tests/testcases/Webstruct — Webstruct 0.6 documentation.html +357 -0
  16. plainhtml-0.3.0/tests/testcases/Webstruct — Webstruct 0.6 documentation.txt +91 -0
  17. plainhtml-0.3.0/uv.lock +324 -0
  18. plainhtml-0.2.2/PKG-INFO +0 -31
  19. plainhtml-0.2.2/pyproject.toml +0 -21
  20. {plainhtml-0.2.2 → plainhtml-0.3.0}/LICENSE +0 -0
  21. {plainhtml-0.2.2 → plainhtml-0.3.0}/plainhtml/__init__.py +0 -0
  22. {plainhtml-0.2.2 → plainhtml-0.3.0}/plainhtml/core.py +0 -0
  23. {plainhtml-0.2.2 → plainhtml-0.3.0}/plainhtml/parser.py +0 -0
  24. {plainhtml-0.2.2 → plainhtml-0.3.0}/plainhtml/utils.py +0 -0
@@ -0,0 +1,23 @@
1
+ name: CI & CD
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - master
7
+
8
+ jobs:
9
+ build:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Check out repository
14
+ uses: actions/checkout@v6
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57
18
+
19
+ - name: Build wheels
20
+ run: uv build
21
+
22
+ - name: Publish wheels
23
+ run: uv publish --token ${{ secrets.PYPI_TOKEN }}
@@ -0,0 +1,64 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ env/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+
27
+ # PyInstaller
28
+ # Usually these files are written by a python script from a template
29
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *,cover
46
+ .hypothesis/
47
+ .pytest_cache
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Django stuff:
54
+ *.log
55
+
56
+ # Sphinx documentation
57
+ docs/_build/
58
+
59
+ # PyBuilder
60
+ target/
61
+
62
+ .env
63
+ .vscode
64
+ .venv
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: plainhtml
3
+ Version: 0.3.0
4
+ Summary: Extract plain text from HTML
5
+ Author-email: Severin Simmler <s.simmler@snapaddy.com>
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.10
8
+ Requires-Dist: lxml[html-clean]==6.0.4
9
+ Description-Content-Type: text/markdown
10
+
11
+ # Extract plain text from HTML
12
+
13
+ ## Installation
14
+
15
+ ```
16
+ $ uv add plainhtml
17
+ ```
18
+
19
+ For development in this repository:
20
+
21
+ ```
22
+ $ uv sync --dev
23
+ ```
24
+
25
+ ## Example
26
+
27
+ ```python
28
+ >>> import plainhtml
29
+ >>> html = "<html><body><p>foo</p><p>bar</p></body></html>"
30
+ >>> plainhtml.extract_text(html)
31
+ 'foo\n\nbar'
32
+ ```
@@ -3,7 +3,13 @@
3
3
  ## Installation
4
4
 
5
5
  ```
6
- $ pip install plainhtml
6
+ $ uv add plainhtml
7
+ ```
8
+
9
+ For development in this repository:
10
+
11
+ ```
12
+ $ uv sync --dev
7
13
  ```
8
14
 
9
15
  ## Example
@@ -0,0 +1,23 @@
1
+ [project]
2
+ name = "plainhtml"
3
+ version = "0.3.0"
4
+ description = "Extract plain text from HTML"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ authors = [{ name = "Severin Simmler", email = "s.simmler@snapaddy.com" }]
8
+ dependencies = [
9
+ "lxml[html_clean]==6.0.4",
10
+ ]
11
+
12
+ [dependency-groups]
13
+ dev = [
14
+ "ruff>=0.2.1",
15
+ "pytest>=8.0.0",
16
+ ]
17
+
18
+ [tool.ruff.lint.per-file-ignores]
19
+ "__init__.py" = ["F401"]
20
+
21
+ [build-system]
22
+ requires = ["hatchling>=1.24.0"]
23
+ build-backend = "hatchling.build"
@@ -0,0 +1,107 @@
1
+ import plainhtml
2
+
3
+
4
+ def test_extract_no_text_html():
5
+ html = (
6
+ '<!DOCTYPE html><html><body><p><video width="320" height="240" '
7
+ 'controls><source src="movie.mp4" type="video/mp4"><source '
8
+ 'src="movie.ogg" type="video/ogg"></video></p></body></html>'
9
+ )
10
+ assert plainhtml.extract(html) == ""
11
+
12
+
13
+ def test_extract():
14
+ html = "<html><style>.div {}</style>" "<body><p>Hello, world!</body></html>"
15
+ assert plainhtml.extract(html) == "Hello, world!"
16
+
17
+
18
+ def test_declared_encoding():
19
+ html = (
20
+ '<?xml version="1.0" encoding="utf-8" ?>'
21
+ "<html><style>.div {}</style>"
22
+ "<body>Hello, world!</p></body></html>"
23
+ )
24
+ assert plainhtml.extract(html) == "Hello, world!"
25
+
26
+
27
+ def test_empty():
28
+ assert plainhtml.extract("") == ""
29
+ assert plainhtml.extract(" ") == ""
30
+ assert plainhtml.extract(None) == ""
31
+
32
+
33
+ def test_inline_tags_whitespace():
34
+ html = "<span>field</span><span>value of</span><span></span>"
35
+ assert plainhtml.extract(html) == "field value of"
36
+
37
+
38
+ def test_nbsp():
39
+ html = "<h1>Foo&nbsp;Bar</h1>"
40
+ assert plainhtml.extract(html) == "Foo Bar"
41
+
42
+
43
+ def test_adjust_newline():
44
+ html = "<div>text 1</div><p><div>text 2</div></p>"
45
+ assert plainhtml.extract(html) == "text 1\n\ntext 2"
46
+
47
+
48
+ """
49
+
50
+ def test_punct_whitespace():
51
+ html = '<div><span>field</span>, and more</div>'
52
+ assert plainhtml.extract(html) == "nice"
53
+
54
+
55
+
56
+ def test_punct_whitespace_preserved():
57
+ html = (u'<div><span>по</span><span>ле</span>, and , '
58
+ u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
59
+ text = plainhtml.extract(html)
60
+ assert text == u'по ле, and , more ! now a (boo)'
61
+
62
+
63
+ def test_guess_layout():
64
+ html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
65
+ '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
66
+ '<p>text_6<em>text_7</em>text_8</p>text_9</div>'
67
+ '<script>document.getElementById("demo").innerHTML = '
68
+ '"This should be skipped";</script> <p>...text_10</p>')
69
+
70
+ text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
71
+ 'text_8 text_9 ...text_10'
72
+ assert plainhtml.extract(html, guess_punct_space=False, guess_layout=False) == text
73
+
74
+ text = ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
75
+ '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')
76
+ assert plainhtml.extract(html, guess_punct_space=False, guess_layout=True) == text
77
+
78
+ text = 'title text_1. text_2 text_3 text_4 text_5 text_6 text_7 ' \
79
+ 'text_8 text_9...text_10'
80
+ assert plainhtml.extract(html, guess_punct_space=True, guess_layout=False) == text
81
+
82
+ text = 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5\n\n' \
83
+ 'text_6 text_7 text_8\n\ntext_9\n\n...text_10'
84
+ assert plainhtml.extract(html, guess_punct_space=True, guess_layout=True) == text
85
+
86
+
87
+ def test_basic_newline():
88
+ html = u'<div>a</div><div>b</div>'
89
+ assert plainhtml.extract(html, guess_punct_space=False, guess_layout=False) == 'a b'
90
+ assert plainhtml.extract(html, guess_punct_space=False, guess_layout=True) == 'a\nb'
91
+ assert plainhtml.extract(html, guess_punct_space=True, guess_layout=False) == 'a b'
92
+ assert plainhtml.extract(html, guess_punct_space=True, guess_layout=True) == 'a\nb'
93
+
94
+
95
+ def test_personalize_newlines_sets():
96
+ html = (u'<span><span>text<a>more</a>'
97
+ '</span>and more text <a> and some more</a> <a></a> </span>')
98
+
99
+ text = plainhtml.extract(html, guess_layout=True,
100
+ newline_tags=NEWLINE_TAGS | {'a'})
101
+ assert text == 'text\nmore\nand more text\nand some more'
102
+
103
+ text = plainhtml.extract(html, guess_layout=True,
104
+ double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'})
105
+ assert text == 'text\n\nmore\n\nand more text\n\nand some more'
106
+
107
+ """
@@ -0,0 +1,361 @@
1
+
2
+
3
+ <!DOCTYPE html>
4
+ <!--[if lt IE 7]> <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
5
+ <!--[if IE 7]> <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
6
+ <!--[if IE 8]> <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
7
+ <!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
8
+ <head>
9
+ <title>
10
+ A Light in the Attic | Books to Scrape - Sandbox
11
+ </title>
12
+
13
+ <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
14
+ <meta name="created" content="24th Jun 2016 09:29" />
15
+ <meta name="description" content="
16
+ It&#39;s hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein&#39;s humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It&#39;s hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein&#39;s humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon&#39;t you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here&#39;sGot it in for you. Shel, you never sounded so good. ...more
17
+ " />
18
+ <meta name="viewport" content="width=device-width" />
19
+ <meta name="robots" content="NOARCHIVE,NOCACHE" />
20
+
21
+ <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
22
+ <!--[if lt IE 9]>
23
+ <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
24
+ <![endif]-->
25
+
26
+
27
+ <link rel="shortcut icon" href="../../static/oscar/favicon.ico" />
28
+
29
+
30
+
31
+
32
+
33
+
34
+ <link rel="stylesheet" type="text/css" href="../../static/oscar/css/styles.css" />
35
+
36
+ <link rel="stylesheet" href="../../static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.css" />
37
+ <link rel="stylesheet" type="text/css" href="../../static/oscar/css/datetimepicker.css" />
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+ </head>
51
+
52
+ <body id="default" class="default">
53
+
54
+
55
+
56
+
57
+ <header class="header container-fluid">
58
+ <div class="page_inner">
59
+ <div class="row">
60
+ <div class="col-sm-8 h1"><a href="../../index.html">Books to Scrape</a><small> We love being scraped!</small>
61
+ </div>
62
+
63
+
64
+ </div>
65
+ </div>
66
+ </header>
67
+
68
+
69
+
70
+ <div class="container-fluid page">
71
+ <div class="page_inner">
72
+
73
+ <ul class="breadcrumb">
74
+ <li>
75
+ <a href="../../index.html">Home</a>
76
+ </li>
77
+
78
+
79
+ <li>
80
+ <a href="../category/books_1/index.html">Books</a>
81
+ </li>
82
+
83
+ <li>
84
+ <a href="../category/books/poetry_23/index.html">Poetry</a>
85
+ </li>
86
+
87
+ <li class="active">A Light in the Attic</li>
88
+
89
+
90
+
91
+
92
+ </ul>
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ <div id="messages">
101
+
102
+ </div>
103
+
104
+
105
+ <div class="content">
106
+
107
+
108
+
109
+ <div id="promotions">
110
+
111
+ </div>
112
+
113
+
114
+ <div id="content_inner">
115
+
116
+ <article class="product_page"><!-- Start of product page -->
117
+
118
+ <div class="row">
119
+
120
+
121
+ <div class="col-sm-6">
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+ <div id="product_gallery" class="carousel">
133
+ <div class="thumbnail">
134
+ <div class="carousel-inner">
135
+ <div class="item active">
136
+
137
+
138
+ <img src="../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg" alt="A Light in the Attic" />
139
+
140
+
141
+ </div>
142
+ </div>
143
+ </div>
144
+ </div>
145
+
146
+
147
+
148
+
149
+ </div>
150
+
151
+
152
+
153
+ <div class="col-sm-6 product_main">
154
+
155
+
156
+ <h1>A Light in the Attic</h1>
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+ <p class="price_color">£51.77</p>
168
+
169
+
170
+ <p class="instock availability">
171
+ <i class="icon-ok"></i>
172
+
173
+ In stock (22 available)
174
+
175
+ </p>
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+ <p class="star-rating Three">
185
+ <i class="icon-star"></i>
186
+ <i class="icon-star"></i>
187
+ <i class="icon-star"></i>
188
+ <i class="icon-star"></i>
189
+ <i class="icon-star"></i>
190
+
191
+ <!-- <small><a href="/catalogue/a-light-in-the-attic_1000/reviews/">
192
+
193
+
194
+ 0 customer reviews
195
+
196
+ </a></small>
197
+ -->&nbsp;
198
+
199
+
200
+ <!--
201
+ <a id="write_review" href="/catalogue/a-light-in-the-attic_1000/reviews/add/#addreview" class="btn btn-success btn-sm">
202
+ Write a review
203
+ </a>
204
+
205
+ --></p>
206
+
207
+
208
+
209
+ <hr/>
210
+
211
+ <div class="alert alert-warning" role="alert"><strong>Warning!</strong> This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.</div>
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+ </div><!-- /col-sm-6 -->
224
+
225
+
226
+ </div><!-- /row -->
227
+
228
+
229
+
230
+ <div id="product_description" class="sub-header">
231
+ <h2>Product Description</h2>
232
+ </div>
233
+ <p>It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more</p>
234
+
235
+
236
+
237
+
238
+ <div class="sub-header">
239
+ <h2>Product Information</h2>
240
+ </div>
241
+ <table class="table table-striped">
242
+
243
+ <tr>
244
+ <th>UPC</th><td>a897fe39b1053632</td>
245
+ </tr>
246
+
247
+ <tr>
248
+ <th>Product Type</th><td>Books</td>
249
+ </tr>
250
+
251
+
252
+
253
+ <tr>
254
+ <th>Price (excl. tax)</th><td>£51.77</td>
255
+ </tr>
256
+
257
+ <tr>
258
+ <th>Price (incl. tax)</th><td>£51.77</td>
259
+ </tr>
260
+ <tr>
261
+ <th>Tax</th><td>£0.00</td>
262
+ </tr>
263
+
264
+ <tr>
265
+ <th>Availability</th>
266
+ <td>In stock (22 available)</td>
267
+ </tr>
268
+
269
+
270
+
271
+ <tr>
272
+ <th>Number of reviews</th>
273
+ <td>0</td>
274
+ </tr>
275
+
276
+ </table>
277
+
278
+
279
+
280
+
281
+ <section>
282
+ <div id="reviews" class="sub-header">
283
+ </div>
284
+ </section>
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+ </article><!-- End of product page -->
301
+ </div>
302
+ </div>
303
+ </div>
304
+ </div>
305
+
306
+
307
+
308
+ <footer class="footer container-fluid">
309
+
310
+
311
+
312
+ </footer>
313
+
314
+
315
+
316
+
317
+
318
+ <!-- jQuery -->
319
+ <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
320
+ <script>window.jQuery || document.write('<script src="../../static/oscar/js/jquery/jquery-1.9.1.min.js"><\/script>')</script>
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+ <!-- Twitter Bootstrap -->
330
+ <script type="text/javascript" src="../../static/oscar/js/bootstrap3/bootstrap.min.js"></script>
331
+ <!-- Oscar -->
332
+ <script src="../../static/oscar/js/oscar/ui.js" type="text/javascript" charset="utf-8"></script>
333
+
334
+ <script src="../../static/oscar/js/bootstrap-datetimepicker/bootstrap-datetimepicker.js" type="text/javascript" charset="utf-8"></script>
335
+ <script src="../../static/oscar/js/bootstrap-datetimepicker/locales/bootstrap-datetimepicker.all.js" type="text/javascript" charset="utf-8"></script>
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+ <script type="text/javascript">
349
+ $(function() {
350
+
351
+
352
+ oscar.init();
353
+
354
+ });
355
+ </script>
356
+
357
+
358
+ <!-- Version: N/A -->
359
+
360
+ </body>
361
+ </html>
@@ -0,0 +1,30 @@
1
+ A Light in the Attic | Books to Scrape - Sandbox
2
+
3
+ Books to Scrape We love being scraped!
4
+
5
+ Home
6
+ Books
7
+ Poetry
8
+ A Light in the Attic
9
+
10
+ A Light in the Attic
11
+
12
+ £51.77
13
+
14
+ In stock (22 available)
15
+
16
+ Warning! This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.
17
+
18
+ Product Description
19
+
20
+ It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more
21
+
22
+ Product Information
23
+
24
+ UPC a897fe39b1053632
25
+ Product Type Books
26
+ Price (excl. tax) £51.77
27
+ Price (incl. tax) £51.77
28
+ Tax £0.00
29
+ Availability In stock (22 available)
30
+ Number of reviews 0