daz-web-extract 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: daz-web-extract
3
+ Version: 0.2.0
4
+ Summary: Async web content extraction library with three-tier fetch strategy
5
+ Author-email: Darren Oakey <darren@oakey.net>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/darrenoakey/daz-web-extract
8
+ Project-URL: Repository, https://github.com/darrenoakey/daz-web-extract
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.12
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: httpx>=0.28.0
14
+ Requires-Dist: lxml>=6.0.0
15
+ Requires-Dist: trafilatura>=1.6.0
16
+ Requires-Dist: playwright>=1.40.0
17
+ Requires-Dist: setproctitle>=1.3.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
20
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
21
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
22
+
23
+ ![](banner.jpg)
24
+
25
+ # daz-web-extract
26
+
27
+ Async Python library that extracts clean title and body text from any URL. It automatically escalates through multiple fetch strategies to handle everything from simple static pages to JavaScript-rendered content. It never throws exceptions — every call returns a structured result indicating success or failure.
28
+
29
+ ## Installation
30
+
31
+ Requires Python 3.12+.
32
+
33
+ ```bash
34
+ pip install daz-web-extract
35
+ ```
36
+
37
+ After installing, set up the browser engine for pages that require JavaScript rendering:
38
+
39
+ ```bash
40
+ playwright install chromium
41
+ ```
42
+
43
+ ## Usage
44
+
45
+ ### Python API
46
+
47
+ The library exposes a single async function `extract` and a result type `ExtractionResult`.
48
+
49
+ ```python
50
+ import asyncio
51
+ from daz_web_extract import extract, ExtractionResult
52
+
53
+ result: ExtractionResult = asyncio.run(extract("https://example.com"))
54
+
55
+ if result.success:
56
+ print(result.title) # Page title
57
+ print(result.body) # Clean body text
58
+ print(result.fetch_method) # Which strategy succeeded
59
+ print(result.content_length) # Length of body in characters
60
+ print(result.elapsed_ms) # Total time in milliseconds
61
+ print(result.status_code) # HTTP status code (if available)
62
+ else:
63
+ print(result.error) # Human-readable error message
64
+ ```
65
+
66
+ #### Limiting fetch strategies
67
+
68
+ Use the `max_tier` parameter to control how far the library escalates:
69
+
70
+ ```python
71
+ # Only use fast HTTP fetch (no browser, no trafilatura)
72
+ result = await extract("https://example.com", max_tier=1)
73
+
74
+ # Use HTTP fetch + trafilatura, but skip the browser
75
+ result = await extract("https://example.com", max_tier=2)
76
+
77
+ # Use all strategies including headless browser (default)
78
+ result = await extract("https://example.com", max_tier=3)
79
+ ```
80
+
81
+ #### Serialization
82
+
83
+ Results can be converted to dictionaries or JSON:
84
+
85
+ ```python
86
+ result.to_dict() # Returns a plain dict
87
+ result.to_json() # Returns a JSON string
88
+ ```
89
+
90
+ #### Using in async code
91
+
92
+ ```python
93
+ import asyncio
94
+ from daz_web_extract import extract
95
+
96
+ async def main():
97
+ urls = [
98
+ "https://example.com",
99
+ "https://www.iana.org/help/example-domains",
100
+ ]
101
+ results = await asyncio.gather(*[extract(url) for url in urls])
102
+ for r in results:
103
+ print(f"{r.url}: {r.title} ({r.content_length} chars)")
104
+
105
+ asyncio.run(main())
106
+ ```
107
+
108
+ ### Command Line
109
+
110
+ Extract content from a URL and print the result:
111
+
112
+ ```bash
113
+ python run_cli.py extract https://example.com
114
+ ```
115
+
116
+ Output:
117
+
118
+ ```
119
+ Title: Example Domain
120
+ Method: httpx
121
+ Length: 217 chars
122
+ Time: 142ms
123
+
124
+ Example Domain
125
+ This domain is for use in illustrative examples in documents. You may use this domain
126
+ in literature without prior coordination or asking for permission.
127
+ More information...
128
+ ```
129
+
130
+ Get raw JSON output:
131
+
132
+ ```bash
133
+ python run_cli.py extract https://example.com --raw
134
+ ```
135
+
136
+ Output:
137
+
138
+ ```json
139
+ {
140
+ "success": true,
141
+ "url": "https://example.com",
142
+ "title": "Example Domain",
143
+ "body": "Example Domain\nThis domain is for use in ...",
144
+ "error": null,
145
+ "fetch_method": "httpx",
146
+ "status_code": 200,
147
+ "content_length": 217,
148
+ "elapsed_ms": 142
149
+ }
150
+ ```
151
+
152
+ ### Using via the `run` script
153
+
154
+ The project includes a `run` script that automatically activates the virtual environment:
155
+
156
+ ```bash
157
+ # Extract content
158
+ ./run extract https://example.com
159
+ ./run extract https://example.com --raw
160
+
161
+ # Run tests
162
+ ./run test src/daz_web_extract/result_test.py
163
+
164
+ # Run linter
165
+ ./run lint
166
+
167
+ # Run full quality checks
168
+ ./run check
169
+ ```
170
+
171
+ ## Development
172
+
173
+ Set up a development environment:
174
+
175
+ ```bash
176
+ python -m venv .venv
177
+ source .venv/bin/activate
178
+ pip install -e ".[dev]"
179
+ playwright install chromium
180
+ ```
181
+
182
+ Run the tests:
183
+
184
+ ```bash
185
+ pytest -q src/
186
+ ```
@@ -0,0 +1,164 @@
1
+ ![](banner.jpg)
2
+
3
+ # daz-web-extract
4
+
5
+ Async Python library that extracts clean title and body text from any URL. It automatically escalates through multiple fetch strategies to handle everything from simple static pages to JavaScript-rendered content. It never throws exceptions — every call returns a structured result indicating success or failure.
6
+
7
+ ## Installation
8
+
9
+ Requires Python 3.12+.
10
+
11
+ ```bash
12
+ pip install daz-web-extract
13
+ ```
14
+
15
+ After installing, set up the browser engine for pages that require JavaScript rendering:
16
+
17
+ ```bash
18
+ playwright install chromium
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ### Python API
24
+
25
+ The library exposes a single async function `extract` and a result type `ExtractionResult`.
26
+
27
+ ```python
28
+ import asyncio
29
+ from daz_web_extract import extract, ExtractionResult
30
+
31
+ result: ExtractionResult = asyncio.run(extract("https://example.com"))
32
+
33
+ if result.success:
34
+ print(result.title) # Page title
35
+ print(result.body) # Clean body text
36
+ print(result.fetch_method) # Which strategy succeeded
37
+ print(result.content_length) # Length of body in characters
38
+ print(result.elapsed_ms) # Total time in milliseconds
39
+ print(result.status_code) # HTTP status code (if available)
40
+ else:
41
+ print(result.error) # Human-readable error message
42
+ ```
43
+
44
+ #### Limiting fetch strategies
45
+
46
+ Use the `max_tier` parameter to control how far the library escalates:
47
+
48
+ ```python
49
+ # Only use fast HTTP fetch (no browser, no trafilatura)
50
+ result = await extract("https://example.com", max_tier=1)
51
+
52
+ # Use HTTP fetch + trafilatura, but skip the browser
53
+ result = await extract("https://example.com", max_tier=2)
54
+
55
+ # Use all strategies including headless browser (default)
56
+ result = await extract("https://example.com", max_tier=3)
57
+ ```
58
+
59
+ #### Serialization
60
+
61
+ Results can be converted to dictionaries or JSON:
62
+
63
+ ```python
64
+ result.to_dict() # Returns a plain dict
65
+ result.to_json() # Returns a JSON string
66
+ ```
67
+
68
+ #### Using in async code
69
+
70
+ ```python
71
+ import asyncio
72
+ from daz_web_extract import extract
73
+
74
+ async def main():
75
+ urls = [
76
+ "https://example.com",
77
+ "https://www.iana.org/help/example-domains",
78
+ ]
79
+ results = await asyncio.gather(*[extract(url) for url in urls])
80
+ for r in results:
81
+ print(f"{r.url}: {r.title} ({r.content_length} chars)")
82
+
83
+ asyncio.run(main())
84
+ ```
85
+
86
+ ### Command Line
87
+
88
+ Extract content from a URL and print the result:
89
+
90
+ ```bash
91
+ python run_cli.py extract https://example.com
92
+ ```
93
+
94
+ Output:
95
+
96
+ ```
97
+ Title: Example Domain
98
+ Method: httpx
99
+ Length: 217 chars
100
+ Time: 142ms
101
+
102
+ Example Domain
103
+ This domain is for use in illustrative examples in documents. You may use this domain
104
+ in literature without prior coordination or asking for permission.
105
+ More information...
106
+ ```
107
+
108
+ Get raw JSON output:
109
+
110
+ ```bash
111
+ python run_cli.py extract https://example.com --raw
112
+ ```
113
+
114
+ Output:
115
+
116
+ ```json
117
+ {
118
+ "success": true,
119
+ "url": "https://example.com",
120
+ "title": "Example Domain",
121
+ "body": "Example Domain\nThis domain is for use in ...",
122
+ "error": null,
123
+ "fetch_method": "httpx",
124
+ "status_code": 200,
125
+ "content_length": 217,
126
+ "elapsed_ms": 142
127
+ }
128
+ ```
129
+
130
+ ### Using via the `run` script
131
+
132
+ The project includes a `run` script that automatically activates the virtual environment:
133
+
134
+ ```bash
135
+ # Extract content
136
+ ./run extract https://example.com
137
+ ./run extract https://example.com --raw
138
+
139
+ # Run tests
140
+ ./run test src/daz_web_extract/result_test.py
141
+
142
+ # Run linter
143
+ ./run lint
144
+
145
+ # Run full quality checks
146
+ ./run check
147
+ ```
148
+
149
+ ## Development
150
+
151
+ Set up a development environment:
152
+
153
+ ```bash
154
+ python -m venv .venv
155
+ source .venv/bin/activate
156
+ pip install -e ".[dev]"
157
+ playwright install chromium
158
+ ```
159
+
160
+ Run the tests:
161
+
162
+ ```bash
163
+ pytest -q src/
164
+ ```
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "daz-web-extract"
7
+ version = "0.2.0"
8
+ description = "Async web content extraction library with three-tier fetch strategy"
9
+ requires-python = ">=3.12"
10
+ license = "MIT"
11
+ authors = [
12
+ {name = "Darren Oakey", email = "darren@oakey.net"},
13
+ ]
14
+ readme = "README.md"
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "Operating System :: OS Independent",
18
+ ]
19
+ dependencies = [
20
+ "httpx>=0.28.0",
21
+ "lxml>=6.0.0",
22
+ "trafilatura>=1.6.0",
23
+ "playwright>=1.40.0",
24
+ "setproctitle>=1.3.0",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/darrenoakey/daz-web-extract"
29
+ Repository = "https://github.com/darrenoakey/daz-web-extract"
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=7.4.0",
34
+ "pytest-asyncio>=0.21.0",
35
+ "ruff>=0.1.0",
36
+ ]
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+
41
+ [tool.pytest.ini_options]
42
+ asyncio_mode = "auto"
43
+ testpaths = ["src"]
44
+
45
+ [tool.ruff]
46
+ line-length = 120
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,4 @@
1
"""Public package API: re-exports the `extract` entry point and its result type."""
from daz_web_extract.result import ExtractionResult
from daz_web_extract.extract import extract

# The public surface is intentionally tiny: one async entry point, one result type.
__all__ = ["extract", "ExtractionResult"]
@@ -0,0 +1,134 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+
5
+ import lxml.html
6
+
7
+
8
# Element tags whose entire subtree is boilerplate/chrome rather than article text.
NOISE_TAGS = {
    "script", "style", "nav", "footer", "aside", "header", "noscript",
    "iframe", "form", "svg", "button", "select", "option", "textarea",
    "input", "label", "fieldset", "legend", "dialog", "menu", "menuitem",
    "details", "summary",
}
# CSS class tokens that mark ads, widgets, overlays, and other non-content regions.
# Matching is exact per whitespace-split token (see _is_noise_element).
NOISE_CLASSES = {
    "ad", "ads", "advert", "advertisement", "banner", "sponsor", "sponsored",
    "promo", "promotion", "sidebar", "widget", "social", "share", "sharing",
    "cookie", "consent", "popup", "modal", "overlay", "newsletter",
    "subscribe", "signup", "sign-up", "cta", "call-to-action",
    "related", "recommended", "trending", "popular", "breadcrumb",
    "pagination", "pager", "toolbar", "tooltip", "dropdown",
    "comment", "comments", "disqus",
}
# Element ids commonly used for non-content page regions (exact, lowercased match).
NOISE_IDS = {
    "ad", "ads", "sidebar", "cookie-banner", "newsletter",
    "comments", "disqus_thread", "social-share",
}
# ARIA landmark roles that denote navigation/chrome rather than article content.
NOISE_ROLES = {"navigation", "banner", "complementary", "contentinfo", "form", "search", "menu", "menubar"}
# Paragraph-like tags whose text is collected as article body.
CONTENT_TAGS = {"p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote", "td", "th", "figcaption", "pre", "dd"}
# Text blocks shorter than this many characters are discarded as fragments.
MIN_BLOCK_LENGTH = 15
# If the joined body falls below this length, extraction is treated as failed (None).
MIN_BODY_LENGTH = 100
# Matches a trailing site-name suffix such as " | SiteName" or " - SiteName"
# (separator may be |, -, en dash, or em dash) at the end of a <title>.
TITLE_SUFFIX_RE = re.compile(r"\s*[\|\-\u2013\u2014]\s*[^|\-\u2013\u2014]+$")
32
+
33
+
34
# ##################################################################
# parse html
# turn raw html (text or utf-8 bytes, with replacement on bad bytes)
# into an lxml document tree
def parse_html(html: str | bytes) -> lxml.html.HtmlElement:
    text = html.decode("utf-8", errors="replace") if isinstance(html, bytes) else html
    return lxml.html.fromstring(text)
41
+
42
+
43
# ##################################################################
# extract title
# choose the best available title in priority order:
# og:title meta tag, then <title> (with site-name suffix removed),
# then the text of the first h1
def extract_title(tree: lxml.html.HtmlElement) -> str | None:
    og_values = tree.xpath('//meta[@property="og:title"]/@content')
    if og_values:
        og_text = og_values[0].strip()
        if og_text:
            return og_text

    title_texts = tree.xpath("//title/text()")
    if title_texts:
        page_title = title_texts[0].strip()
        if page_title:
            return _clean_title_suffix(page_title)

    heading_parts = [t.strip() for t in tree.xpath("//h1//text()") if t.strip()]
    if heading_parts:
        return " ".join(heading_parts)

    return None
64
+
65
+
66
# ##################################################################
# clean title suffix
# drop a trailing site-name suffix like " | SiteName" or " - SiteName";
# if stripping would leave nothing, keep the original title unchanged
def _clean_title_suffix(title: str) -> str:
    stripped = TITLE_SUFFIX_RE.sub("", title)
    if stripped.strip():
        return stripped
    return title
72
+
73
+
74
# ##################################################################
# extract text content
# produce clean plain-text article body: remove noise elements from the
# tree (mutates it), gather paragraph-like text blocks, drop fragments
# shorter than MIN_BLOCK_LENGTH, and return None when the combined body
# is below MIN_BODY_LENGTH; link text is kept as part of the content
def extract_text_content(tree: lxml.html.HtmlElement) -> str | None:
    _remove_noise(tree)
    kept = [block for block in _collect_blocks(tree) if len(block) >= MIN_BLOCK_LENGTH]
    body = "\n\n".join(kept)
    return body if len(body) >= MIN_BODY_LENGTH else None
87
+
88
+
89
# ##################################################################
# remove noise
# delete script, style, nav, footer, ad, form, and similar elements
# from the tree in place; candidates are collected first so removal
# does not interfere with the traversal
def _remove_noise(tree: lxml.html.HtmlElement) -> None:
    doomed = [el for el in tree.xpath("//*") if _is_noise_element(el)]
    for el in doomed:
        parent = el.getparent()
        if parent is not None:
            parent.remove(el)
102
+
103
+
104
# ##################################################################
# is noise element
# classify an element as noise if its tag, any of its class tokens,
# its id, or its aria role matches the noise sets
def _is_noise_element(el: lxml.html.HtmlElement) -> bool:
    if el.tag in NOISE_TAGS:
        return True
    if set(el.get("class", "").lower().split()) & NOISE_CLASSES:
        return True
    if el.get("id", "").lower() in NOISE_IDS:
        return True
    return el.get("role", "").lower() in NOISE_ROLES
120
+
121
+
122
# ##################################################################
# collect blocks
# gather whitespace-normalized text from paragraph-like elements; link
# text within paragraphs is preserved since links are part of article
# content. An element nested inside another content tag (e.g. a <p>
# inside a <blockquote> or <li>) is skipped: the ancestor's
# text_content() already includes its text, and collecting both would
# duplicate the same passage in the output.
def _collect_blocks(tree: lxml.html.HtmlElement) -> list[str]:
    blocks: list[str] = []
    for el in tree.xpath("//*"):
        if el.tag not in CONTENT_TAGS:
            continue
        # outermost content elements only — avoids double-counting nested text
        if any(anc.tag in CONTENT_TAGS for anc in el.iterancestors()):
            continue
        text = re.sub(r"\s+", " ", el.text_content().strip())
        if text:
            blocks.append(text)
    return blocks