daz-web-extract 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daz_web_extract-0.2.0/PKG-INFO +186 -0
- daz_web_extract-0.2.0/README.md +164 -0
- daz_web_extract-0.2.0/pyproject.toml +46 -0
- daz_web_extract-0.2.0/setup.cfg +4 -0
- daz_web_extract-0.2.0/src/daz_web_extract/__init__.py +4 -0
- daz_web_extract-0.2.0/src/daz_web_extract/content.py +134 -0
- daz_web_extract-0.2.0/src/daz_web_extract/content_test.py +1642 -0
- daz_web_extract-0.2.0/src/daz_web_extract/extract.py +65 -0
- daz_web_extract-0.2.0/src/daz_web_extract/extract_test.py +82 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_http.py +85 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_http_test.py +46 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_playwright.py +114 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_playwright_test.py +32 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_trafilatura.py +92 -0
- daz_web_extract-0.2.0/src/daz_web_extract/fetch_trafilatura_test.py +29 -0
- daz_web_extract-0.2.0/src/daz_web_extract/result.py +80 -0
- daz_web_extract-0.2.0/src/daz_web_extract/result_test.py +126 -0
- daz_web_extract-0.2.0/src/daz_web_extract.egg-info/PKG-INFO +186 -0
- daz_web_extract-0.2.0/src/daz_web_extract.egg-info/SOURCES.txt +20 -0
- daz_web_extract-0.2.0/src/daz_web_extract.egg-info/dependency_links.txt +1 -0
- daz_web_extract-0.2.0/src/daz_web_extract.egg-info/requires.txt +10 -0
- daz_web_extract-0.2.0/src/daz_web_extract.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: daz-web-extract
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Async web content extraction library with three-tier fetch strategy
|
|
5
|
+
Author-email: Darren Oakey <darren@oakey.net>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/darrenoakey/daz-web-extract
|
|
8
|
+
Project-URL: Repository, https://github.com/darrenoakey/daz-web-extract
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.12
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: httpx>=0.28.0
|
|
14
|
+
Requires-Dist: lxml>=6.0.0
|
|
15
|
+
Requires-Dist: trafilatura>=1.6.0
|
|
16
|
+
Requires-Dist: playwright>=1.40.0
|
|
17
|
+
Requires-Dist: setproctitle>=1.3.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
22
|
+
|
|
23
|
+

|
|
24
|
+
|
|
25
|
+
# daz-web-extract
|
|
26
|
+
|
|
27
|
+
Async Python library that extracts clean title and body text from any URL. It automatically escalates through multiple fetch strategies to handle everything from simple static pages to JavaScript-rendered content. It never throws exceptions — every call returns a structured result indicating success or failure.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
Requires Python 3.12+.
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install daz-web-extract
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
After installing, set up the browser engine for pages that require JavaScript rendering:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
playwright install chromium
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Usage
|
|
44
|
+
|
|
45
|
+
### Python API
|
|
46
|
+
|
|
47
|
+
The library exposes a single async function `extract` and a result type `ExtractionResult`.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import asyncio
|
|
51
|
+
from daz_web_extract import extract, ExtractionResult
|
|
52
|
+
|
|
53
|
+
result: ExtractionResult = asyncio.run(extract("https://example.com"))
|
|
54
|
+
|
|
55
|
+
if result.success:
|
|
56
|
+
print(result.title) # Page title
|
|
57
|
+
print(result.body) # Clean body text
|
|
58
|
+
print(result.fetch_method) # Which strategy succeeded
|
|
59
|
+
print(result.content_length) # Length of body in characters
|
|
60
|
+
print(result.elapsed_ms) # Total time in milliseconds
|
|
61
|
+
print(result.status_code) # HTTP status code (if available)
|
|
62
|
+
else:
|
|
63
|
+
print(result.error) # Human-readable error message
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
#### Limiting fetch strategies
|
|
67
|
+
|
|
68
|
+
Use the `max_tier` parameter to control how far the library escalates:
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
# Only use fast HTTP fetch (no browser, no trafilatura)
|
|
72
|
+
result = await extract("https://example.com", max_tier=1)
|
|
73
|
+
|
|
74
|
+
# Use HTTP fetch + trafilatura, but skip the browser
|
|
75
|
+
result = await extract("https://example.com", max_tier=2)
|
|
76
|
+
|
|
77
|
+
# Use all strategies including headless browser (default)
|
|
78
|
+
result = await extract("https://example.com", max_tier=3)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
#### Serialization
|
|
82
|
+
|
|
83
|
+
Results can be converted to dictionaries or JSON:
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
result.to_dict() # Returns a plain dict
|
|
87
|
+
result.to_json() # Returns a JSON string
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
#### Using in async code
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import asyncio
|
|
94
|
+
from daz_web_extract import extract
|
|
95
|
+
|
|
96
|
+
async def main():
|
|
97
|
+
urls = [
|
|
98
|
+
"https://example.com",
|
|
99
|
+
"https://www.iana.org/help/example-domains",
|
|
100
|
+
]
|
|
101
|
+
results = await asyncio.gather(*[extract(url) for url in urls])
|
|
102
|
+
for r in results:
|
|
103
|
+
print(f"{r.url}: {r.title} ({r.content_length} chars)")
|
|
104
|
+
|
|
105
|
+
asyncio.run(main())
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Command Line
|
|
109
|
+
|
|
110
|
+
Extract content from a URL and print the result:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
python run_cli.py extract https://example.com
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Output:
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
Title: Example Domain
|
|
120
|
+
Method: httpx
|
|
121
|
+
Length: 217 chars
|
|
122
|
+
Time: 142ms
|
|
123
|
+
|
|
124
|
+
Example Domain
|
|
125
|
+
This domain is for use in illustrative examples in documents. You may use this domain
|
|
126
|
+
in literature without prior coordination or asking for permission.
|
|
127
|
+
More information...
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Get raw JSON output:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
python run_cli.py extract https://example.com --raw
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Output:
|
|
137
|
+
|
|
138
|
+
```json
|
|
139
|
+
{
|
|
140
|
+
"success": true,
|
|
141
|
+
"url": "https://example.com",
|
|
142
|
+
"title": "Example Domain",
|
|
143
|
+
"body": "Example Domain\nThis domain is for use in ...",
|
|
144
|
+
"error": null,
|
|
145
|
+
"fetch_method": "httpx",
|
|
146
|
+
"status_code": 200,
|
|
147
|
+
"content_length": 217,
|
|
148
|
+
"elapsed_ms": 142
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Using via the `run` script
|
|
153
|
+
|
|
154
|
+
The project includes a `run` script that automatically activates the virtual environment:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Extract content
|
|
158
|
+
./run extract https://example.com
|
|
159
|
+
./run extract https://example.com --raw
|
|
160
|
+
|
|
161
|
+
# Run tests
|
|
162
|
+
./run test src/daz_web_extract/result_test.py
|
|
163
|
+
|
|
164
|
+
# Run linter
|
|
165
|
+
./run lint
|
|
166
|
+
|
|
167
|
+
# Run full quality checks
|
|
168
|
+
./run check
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
Set up a development environment:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
python -m venv .venv
|
|
177
|
+
source .venv/bin/activate
|
|
178
|
+
pip install -e ".[dev]"
|
|
179
|
+
playwright install chromium
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Run the tests:
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
pytest -q src/
|
|
186
|
+
```
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
# daz-web-extract
|
|
4
|
+
|
|
5
|
+
Async Python library that extracts clean title and body text from any URL. It automatically escalates through multiple fetch strategies to handle everything from simple static pages to JavaScript-rendered content. It never throws exceptions — every call returns a structured result indicating success or failure.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
Requires Python 3.12+.
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install daz-web-extract
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
After installing, set up the browser engine for pages that require JavaScript rendering:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
playwright install chromium
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
### Python API
|
|
24
|
+
|
|
25
|
+
The library exposes a single async function `extract` and a result type `ExtractionResult`.
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
import asyncio
|
|
29
|
+
from daz_web_extract import extract, ExtractionResult
|
|
30
|
+
|
|
31
|
+
result: ExtractionResult = asyncio.run(extract("https://example.com"))
|
|
32
|
+
|
|
33
|
+
if result.success:
|
|
34
|
+
print(result.title) # Page title
|
|
35
|
+
print(result.body) # Clean body text
|
|
36
|
+
print(result.fetch_method) # Which strategy succeeded
|
|
37
|
+
print(result.content_length) # Length of body in characters
|
|
38
|
+
print(result.elapsed_ms) # Total time in milliseconds
|
|
39
|
+
print(result.status_code) # HTTP status code (if available)
|
|
40
|
+
else:
|
|
41
|
+
print(result.error) # Human-readable error message
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
#### Limiting fetch strategies
|
|
45
|
+
|
|
46
|
+
Use the `max_tier` parameter to control how far the library escalates:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
# Only use fast HTTP fetch (no browser, no trafilatura)
|
|
50
|
+
result = await extract("https://example.com", max_tier=1)
|
|
51
|
+
|
|
52
|
+
# Use HTTP fetch + trafilatura, but skip the browser
|
|
53
|
+
result = await extract("https://example.com", max_tier=2)
|
|
54
|
+
|
|
55
|
+
# Use all strategies including headless browser (default)
|
|
56
|
+
result = await extract("https://example.com", max_tier=3)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
#### Serialization
|
|
60
|
+
|
|
61
|
+
Results can be converted to dictionaries or JSON:
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
result.to_dict() # Returns a plain dict
|
|
65
|
+
result.to_json() # Returns a JSON string
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
#### Using in async code
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
import asyncio
|
|
72
|
+
from daz_web_extract import extract
|
|
73
|
+
|
|
74
|
+
async def main():
|
|
75
|
+
urls = [
|
|
76
|
+
"https://example.com",
|
|
77
|
+
"https://www.iana.org/help/example-domains",
|
|
78
|
+
]
|
|
79
|
+
results = await asyncio.gather(*[extract(url) for url in urls])
|
|
80
|
+
for r in results:
|
|
81
|
+
print(f"{r.url}: {r.title} ({r.content_length} chars)")
|
|
82
|
+
|
|
83
|
+
asyncio.run(main())
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Command Line
|
|
87
|
+
|
|
88
|
+
Extract content from a URL and print the result:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
python run_cli.py extract https://example.com
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Output:
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
Title: Example Domain
|
|
98
|
+
Method: httpx
|
|
99
|
+
Length: 217 chars
|
|
100
|
+
Time: 142ms
|
|
101
|
+
|
|
102
|
+
Example Domain
|
|
103
|
+
This domain is for use in illustrative examples in documents. You may use this domain
|
|
104
|
+
in literature without prior coordination or asking for permission.
|
|
105
|
+
More information...
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Get raw JSON output:
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
python run_cli.py extract https://example.com --raw
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Output:
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
{
|
|
118
|
+
"success": true,
|
|
119
|
+
"url": "https://example.com",
|
|
120
|
+
"title": "Example Domain",
|
|
121
|
+
"body": "Example Domain\nThis domain is for use in ...",
|
|
122
|
+
"error": null,
|
|
123
|
+
"fetch_method": "httpx",
|
|
124
|
+
"status_code": 200,
|
|
125
|
+
"content_length": 217,
|
|
126
|
+
"elapsed_ms": 142
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Using via the `run` script
|
|
131
|
+
|
|
132
|
+
The project includes a `run` script that automatically activates the virtual environment:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# Extract content
|
|
136
|
+
./run extract https://example.com
|
|
137
|
+
./run extract https://example.com --raw
|
|
138
|
+
|
|
139
|
+
# Run tests
|
|
140
|
+
./run test src/daz_web_extract/result_test.py
|
|
141
|
+
|
|
142
|
+
# Run linter
|
|
143
|
+
./run lint
|
|
144
|
+
|
|
145
|
+
# Run full quality checks
|
|
146
|
+
./run check
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Development
|
|
150
|
+
|
|
151
|
+
Set up a development environment:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
python -m venv .venv
|
|
155
|
+
source .venv/bin/activate
|
|
156
|
+
pip install -e ".[dev]"
|
|
157
|
+
playwright install chromium
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Run the tests:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pytest -q src/
|
|
164
|
+
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "daz-web-extract"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Async web content extraction library with three-tier fetch strategy"
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
authors = [
|
|
12
|
+
{name = "Darren Oakey", email = "darren@oakey.net"},
|
|
13
|
+
]
|
|
14
|
+
readme = "README.md"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"httpx>=0.28.0",
|
|
21
|
+
"lxml>=6.0.0",
|
|
22
|
+
"trafilatura>=1.6.0",
|
|
23
|
+
"playwright>=1.40.0",
|
|
24
|
+
"setproctitle>=1.3.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/darrenoakey/daz-web-extract"
|
|
29
|
+
Repository = "https://github.com/darrenoakey/daz-web-extract"
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = [
|
|
33
|
+
"pytest>=7.4.0",
|
|
34
|
+
"pytest-asyncio>=0.21.0",
|
|
35
|
+
"ruff>=0.1.0",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
asyncio_mode = "auto"
|
|
43
|
+
testpaths = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.ruff]
|
|
46
|
+
line-length = 120
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import lxml.html
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Tags whose entire subtree is page chrome / boilerplate, never article content.
NOISE_TAGS = {
    "script", "style", "nav", "footer", "aside", "header", "noscript",
    "iframe", "form", "svg", "button", "select", "option", "textarea",
    "input", "label", "fieldset", "legend", "dialog", "menu", "menuitem",
    "details", "summary",
}
# Class-name tokens that mark ads, sharing widgets, consent banners, and
# similar non-content elements (matched against whitespace-split class list).
NOISE_CLASSES = {
    "ad", "ads", "advert", "advertisement", "banner", "sponsor", "sponsored",
    "promo", "promotion", "sidebar", "widget", "social", "share", "sharing",
    "cookie", "consent", "popup", "modal", "overlay", "newsletter",
    "subscribe", "signup", "sign-up", "cta", "call-to-action",
    "related", "recommended", "trending", "popular", "breadcrumb",
    "pagination", "pager", "toolbar", "tooltip", "dropdown",
    "comment", "comments", "disqus",
}
# Element ids that conventionally identify non-content regions.
NOISE_IDS = {
    "ad", "ads", "sidebar", "cookie-banner", "newsletter",
    "comments", "disqus_thread", "social-share",
}
# ARIA landmark roles denoting navigation/chrome rather than the main content.
NOISE_ROLES = {"navigation", "banner", "complementary", "contentinfo", "form", "search", "menu", "menubar"}
# Paragraph-like tags whose text is collected as the article body.
CONTENT_TAGS = {"p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote", "td", "th", "figcaption", "pre", "dd"}
# Minimum characters for a single text block to be kept.
MIN_BLOCK_LENGTH = 15
# Minimum total characters for an extracted body to count as a success.
MIN_BODY_LENGTH = 100
# Matches a trailing site-name suffix such as " | SiteName" or " - SiteName".
# The separator must be surrounded by whitespace (\s+ on both sides) so a
# hyphen inside a word is never treated as a suffix delimiter: with the old
# "\s*" form, "COVID-19 Vaccine Update" was truncated to "COVID".
TITLE_SUFFIX_RE = re.compile(r"\s+[|\-\u2013\u2014]\s+[^|\-\u2013\u2014]+$")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ##################################################################
# parse html
# convert raw html bytes or string into an lxml element tree
def parse_html(html: str | bytes) -> lxml.html.HtmlElement:
    """Parse raw HTML into an lxml element tree.

    Bytes input is decoded as UTF-8 with undecodable bytes replaced, so
    parsing never fails on a bad encoding.
    """
    text = html.decode("utf-8", errors="replace") if isinstance(html, bytes) else html
    return lxml.html.fromstring(text)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ##################################################################
# extract title
# pull the best title from html using priority: og:title > <title> > first h1
def extract_title(tree: lxml.html.HtmlElement) -> str | None:
    """Return the page title, or None when no usable candidate exists.

    Priority: og:title meta tag, then the <title> element (with any
    site-name suffix stripped), then the text of the first h1.
    """
    og_values = tree.xpath('//meta[@property="og:title"]/@content')
    if og_values:
        og_title = og_values[0].strip()
        if og_title:
            return og_title

    # Only the first <title> text node is considered; an empty one
    # falls through to the h1 candidate, matching the og:title handling.
    for raw in tree.xpath("//title/text()")[:1]:
        stripped = raw.strip()
        if stripped:
            return _clean_title_suffix(stripped)

    h1_parts = [t.strip() for t in tree.xpath("//h1//text()") if t.strip()]
    if h1_parts:
        return " ".join(h1_parts)

    return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ##################################################################
# clean title suffix
# remove common site name suffixes like " | SiteName" or " - SiteName"
def _clean_title_suffix(title: str) -> str:
    """Strip a trailing site-name suffix, keeping the original title
    when removal would leave nothing but whitespace."""
    stripped = TITLE_SUFFIX_RE.sub("", title)
    if stripped.strip():
        return stripped
    return title
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ##################################################################
# extract text content
# pull clean article text from html by collecting text from content
# tags after removing all noise elements; link text is preserved but
# all html formatting is stripped to produce plain text
def extract_text_content(tree: lxml.html.HtmlElement) -> str | None:
    """Return the plain-text body of the page, or None when too little
    content survives filtering.

    Mutates *tree* in place by removing noise elements before collecting
    text blocks; blocks shorter than MIN_BLOCK_LENGTH are discarded and a
    combined body shorter than MIN_BODY_LENGTH is treated as a failure.
    """
    _remove_noise(tree)
    kept = [block for block in _collect_blocks(tree) if len(block) >= MIN_BLOCK_LENGTH]
    body = "\n\n".join(kept)
    return body if len(body) >= MIN_BODY_LENGTH else None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ##################################################################
# remove noise
# strip script, style, nav, footer, ads, forms, and other noise
# elements from tree using tags, class names, ids, and aria roles
def _remove_noise(tree: lxml.html.HtmlElement) -> None:
    """Remove every noise element from *tree* in place."""
    # Collect first, then detach: removing elements while iterating the
    # xpath result would skip nodes.
    doomed = [el for el in tree.xpath("//*") if _is_noise_element(el)]
    for el in doomed:
        parent = el.getparent()
        # The root element (or an element whose ancestor was already
        # removed) has no parent; nothing to detach in that case.
        if parent is not None:
            parent.remove(el)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ##################################################################
# is noise element
# check whether an element is noise by tag, class, id, or role
def _is_noise_element(el: lxml.html.HtmlElement) -> bool:
    """Return True when *el* matches any noise tag, class token, id, or
    ARIA role."""
    if el.tag in NOISE_TAGS:
        return True
    class_tokens = el.get("class", "").lower().split()
    if any(token in NOISE_CLASSES for token in class_tokens):
        return True
    if el.get("id", "").lower() in NOISE_IDS:
        return True
    return el.get("role", "").lower() in NOISE_ROLES
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ##################################################################
# collect blocks
# gather text content from paragraph-like elements; link text within
# paragraphs is preserved since links are part of article content
def _collect_blocks(tree: lxml.html.HtmlElement) -> list[str]:
    """Return the whitespace-normalized text of every content element,
    in document order, skipping elements that yield no text."""
    collected: list[str] = []
    for node in tree.xpath("//*"):
        if node.tag not in CONTENT_TAGS:
            continue
        # text_content() flattens the subtree (keeping link text);
        # collapse internal whitespace runs to single spaces.
        normalized = re.sub(r"\s+", " ", node.text_content().strip())
        if normalized:
            collected.append(normalized)
    return collected
|