pagepull 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pagepull-0.1.0/PKG-INFO +205 -0
- pagepull-0.1.0/README.md +186 -0
- pagepull-0.1.0/pyproject.toml +30 -0
- pagepull-0.1.0/src/pagepull/__init__.py +3 -0
- pagepull-0.1.0/src/pagepull/cli.py +99 -0
- pagepull-0.1.0/src/pagepull/commands/__init__.py +0 -0
- pagepull-0.1.0/src/pagepull/commands/div.py +29 -0
- pagepull-0.1.0/src/pagepull/commands/select.py +34 -0
- pagepull-0.1.0/src/pagepull/commands/strip.py +24 -0
- pagepull-0.1.0/src/pagepull/commands/text.py +27 -0
- pagepull-0.1.0/src/pagepull/output.py +41 -0
- pagepull-0.1.0/src/pagepull/source.py +43 -0
pagepull-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pagepull
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract and transform HTML page content with composable CLI tools
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Neil Johnson
|
|
7
|
+
Author-email: neil@cadent.com
|
|
8
|
+
Requires-Python: >=3.11,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
|
|
16
|
+
Requires-Dist: requests (>=2.31,<3.0)
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# pagepull
|
|
20
|
+
|
|
21
|
+
Extract structured data from HTML pages via the command line.
|
|
22
|
+
|
|
23
|
+
pagepull wraps [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) behind a simple CLI, turning common DOM extraction tasks into one-liners. Think of it as `jq` for HTML.
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install pagepull
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Or with pipx for isolated install:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pipx install pagepull
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Extract the content div from a WordPress page
|
|
41
|
+
pagepull div entry-content https://example.com/about
|
|
42
|
+
|
|
43
|
+
# Same thing, as markdown
|
|
44
|
+
pagepull div entry-content --markdown https://example.com/about
|
|
45
|
+
|
|
46
|
+
# List images and check for missing alt text
|
|
47
|
+
pagepull images --alt https://example.com/about
|
|
48
|
+
|
|
49
|
+
# Pull meta tags
|
|
50
|
+
pagepull meta --title --description https://example.com/about
|
|
51
|
+
|
|
52
|
+
# Use any CSS selector
|
|
53
|
+
pagepull select "nav.primary a" page.html
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Input
|
|
57
|
+
|
|
58
|
+
pagepull accepts three input types:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Local file
|
|
62
|
+
pagepull div content page.html
|
|
63
|
+
|
|
64
|
+
# URL (fetched automatically)
|
|
65
|
+
pagepull div content https://example.com/page
|
|
66
|
+
|
|
67
|
+
# stdin
|
|
68
|
+
curl -s https://example.com | pagepull div content
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Commands
|
|
72
|
+
|
|
73
|
+
### `div` — Extract a div by class or id
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pagepull div entry-content page.html
|
|
77
|
+
pagepull div sidebar --by id page.html
|
|
78
|
+
pagepull div entry-content --strip script,style --markdown page.html
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### `images` — List images with metadata
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pagepull images page.html
|
|
85
|
+
pagepull images --alt --dimensions page.html
|
|
86
|
+
pagepull images --json page.html
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Use `--alt` to show alt text (missing alt is flagged as `[MISSING]`) and `--dimensions` to include width/height.
|
|
90
|
+
|
|
91
|
+
### `meta` — Extract meta tags
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
pagepull meta page.html # all meta tags
|
|
95
|
+
pagepull meta --title --description page.html # specific tags
|
|
96
|
+
pagepull meta --og page.html # Open Graph tags
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### `links` — List all links
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pagepull links page.html
|
|
103
|
+
pagepull links --external-only page.html
|
|
104
|
+
pagepull links --csv page.html
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### `headings` — Heading hierarchy
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pagepull headings page.html
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
h1: Welcome to Our Site
|
|
115
|
+
h2: About Us
|
|
116
|
+
h2: Services
|
|
117
|
+
h3: Web Design
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### `text` — Visible text only
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pagepull text page.html
|
|
124
|
+
pagepull text --selector "div.entry-content" page.html
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### `select` — Raw CSS selector
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
pagepull select "nav a" page.html
|
|
131
|
+
pagepull select "img[alt='']" --json page.html
|
|
132
|
+
pagepull select "h2 + p" --text page.html
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### `strip` — Remove elements
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
pagepull strip script noscript style page.html
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### `table` — Extract HTML tables
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
pagepull table --csv page.html
|
|
145
|
+
pagepull table --index 0 --json page.html
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## Global Flags
|
|
149
|
+
|
|
150
|
+
| Flag | Description |
|
|
151
|
+
|------|-------------|
|
|
152
|
+
| `--selector <css>` | Scope any command to a CSS selector first |
|
|
153
|
+
| `--json` | Structured JSON output |
|
|
154
|
+
| `--csv` | CSV output (where applicable) |
|
|
155
|
+
| `--markdown` | Convert HTML to markdown |
|
|
156
|
+
| `--quiet` | Suppress headers and labels |
|
|
157
|
+
|
|
158
|
+
## Scoping with `--selector`
|
|
159
|
+
|
|
160
|
+
Any command can be scoped to a portion of the page:
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
# Images only within the article
|
|
164
|
+
pagepull images --alt --selector "article" page.html
|
|
165
|
+
|
|
166
|
+
# Links only in the footer
|
|
167
|
+
pagepull links --selector "footer" page.html
|
|
168
|
+
|
|
169
|
+
# Text from a specific section
|
|
170
|
+
pagepull text --selector "div.entry-content" page.html
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Pairing with sitewalker
|
|
174
|
+
|
|
175
|
+
pagepull handles one page. [sitewalker](https://github.com/cadentdev/sitewalker) crawls sites. Together they cover site-wide extraction:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
# Audit alt text across an entire site
|
|
179
|
+
sitewalker -p https://example.com | xargs -I{} pagepull images --alt --json {}
|
|
180
|
+
|
|
181
|
+
# Extract every page title
|
|
182
|
+
sitewalker -p https://example.com | xargs -I{} pagepull meta --title {}
|
|
183
|
+
|
|
184
|
+
# Pull article content as markdown
|
|
185
|
+
sitewalker -p https://example.com | xargs -I{} pagepull div content --markdown {}
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Development
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
git clone git@github.com:cadentdev/pagepull.git
|
|
192
|
+
cd pagepull
|
|
193
|
+
poetry install
|
|
194
|
+
poetry run pytest
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
## Requirements
|
|
198
|
+
|
|
199
|
+
- Python 3.11+
|
|
200
|
+
- Dependencies: beautifulsoup4, requests, markdownify
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
MIT
|
|
205
|
+
|
pagepull-0.1.0/README.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# pagepull
|
|
2
|
+
|
|
3
|
+
Extract structured data from HTML pages via the command line.
|
|
4
|
+
|
|
5
|
+
pagepull wraps [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) behind a simple CLI, turning common DOM extraction tasks into one-liners. Think of it as `jq` for HTML.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install pagepull
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Or with pipx for isolated install:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pipx install pagepull
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Extract the content div from a WordPress page
|
|
23
|
+
pagepull div entry-content https://example.com/about
|
|
24
|
+
|
|
25
|
+
# Same thing, as markdown
|
|
26
|
+
pagepull div entry-content --markdown https://example.com/about
|
|
27
|
+
|
|
28
|
+
# List images and check for missing alt text
|
|
29
|
+
pagepull images --alt https://example.com/about
|
|
30
|
+
|
|
31
|
+
# Pull meta tags
|
|
32
|
+
pagepull meta --title --description https://example.com/about
|
|
33
|
+
|
|
34
|
+
# Use any CSS selector
|
|
35
|
+
pagepull select "nav.primary a" page.html
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Input
|
|
39
|
+
|
|
40
|
+
pagepull accepts three input types:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# Local file
|
|
44
|
+
pagepull div content page.html
|
|
45
|
+
|
|
46
|
+
# URL (fetched automatically)
|
|
47
|
+
pagepull div content https://example.com/page
|
|
48
|
+
|
|
49
|
+
# stdin
|
|
50
|
+
curl -s https://example.com | pagepull div content
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Commands
|
|
54
|
+
|
|
55
|
+
### `div` — Extract a div by class or id
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pagepull div entry-content page.html
|
|
59
|
+
pagepull div sidebar --by id page.html
|
|
60
|
+
pagepull div entry-content --strip script,style --markdown page.html
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### `images` — List images with metadata
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pagepull images page.html
|
|
67
|
+
pagepull images --alt --dimensions page.html
|
|
68
|
+
pagepull images --json page.html
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Use `--alt` to show alt text (missing alt is flagged as `[MISSING]`) and `--dimensions` to include width/height.
|
|
72
|
+
|
|
73
|
+
### `meta` — Extract meta tags
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pagepull meta page.html # all meta tags
|
|
77
|
+
pagepull meta --title --description page.html # specific tags
|
|
78
|
+
pagepull meta --og page.html # Open Graph tags
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### `links` — List all links
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pagepull links page.html
|
|
85
|
+
pagepull links --external-only page.html
|
|
86
|
+
pagepull links --csv page.html
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### `headings` — Heading hierarchy
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pagepull headings page.html
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
h1: Welcome to Our Site
|
|
97
|
+
h2: About Us
|
|
98
|
+
h2: Services
|
|
99
|
+
h3: Web Design
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### `text` — Visible text only
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
pagepull text page.html
|
|
106
|
+
pagepull text --selector "div.entry-content" page.html
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### `select` — Raw CSS selector
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pagepull select "nav a" page.html
|
|
113
|
+
pagepull select "img[alt='']" --json page.html
|
|
114
|
+
pagepull select "h2 + p" --text page.html
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### `strip` — Remove elements
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pagepull strip script noscript style page.html
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### `table` — Extract HTML tables
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pagepull table --csv page.html
|
|
127
|
+
pagepull table --index 0 --json page.html
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Global Flags
|
|
131
|
+
|
|
132
|
+
| Flag | Description |
|
|
133
|
+
|------|-------------|
|
|
134
|
+
| `--selector <css>` | Scope any command to a CSS selector first |
|
|
135
|
+
| `--json` | Structured JSON output |
|
|
136
|
+
| `--csv` | CSV output (where applicable) |
|
|
137
|
+
| `--markdown` | Convert HTML to markdown |
|
|
138
|
+
| `--quiet` | Suppress headers and labels |
|
|
139
|
+
|
|
140
|
+
## Scoping with `--selector`
|
|
141
|
+
|
|
142
|
+
Any command can be scoped to a portion of the page:
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# Images only within the article
|
|
146
|
+
pagepull images --alt --selector "article" page.html
|
|
147
|
+
|
|
148
|
+
# Links only in the footer
|
|
149
|
+
pagepull links --selector "footer" page.html
|
|
150
|
+
|
|
151
|
+
# Text from a specific section
|
|
152
|
+
pagepull text --selector "div.entry-content" page.html
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Pairing with sitewalker
|
|
156
|
+
|
|
157
|
+
pagepull handles one page. [sitewalker](https://github.com/cadentdev/sitewalker) crawls sites. Together they cover site-wide extraction:
|
|
158
|
+
|
|
159
|
+
```bash
|
|
160
|
+
# Audit alt text across an entire site
|
|
161
|
+
sitewalker -p https://example.com | xargs -I{} pagepull images --alt --json {}
|
|
162
|
+
|
|
163
|
+
# Extract every page title
|
|
164
|
+
sitewalker -p https://example.com | xargs -I{} pagepull meta --title {}
|
|
165
|
+
|
|
166
|
+
# Pull article content as markdown
|
|
167
|
+
sitewalker -p https://example.com | xargs -I{} pagepull div content --markdown {}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Development
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
git clone git@github.com:cadentdev/pagepull.git
|
|
174
|
+
cd pagepull
|
|
175
|
+
poetry install
|
|
176
|
+
poetry run pytest
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## Requirements
|
|
180
|
+
|
|
181
|
+
- Python 3.11+
|
|
182
|
+
- Dependencies: beautifulsoup4, requests, markdownify
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "pagepull"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Extract and transform HTML page content with composable CLI tools"
|
|
5
|
+
authors = ["Neil Johnson <neil@cadent.com>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
packages = [{include = "pagepull", from = "src"}]
|
|
9
|
+
|
|
10
|
+
[tool.poetry.dependencies]
|
|
11
|
+
python = "^3.11"
|
|
12
|
+
beautifulsoup4 = "^4.12"
|
|
13
|
+
requests = "^2.31"
|
|
14
|
+
|
|
15
|
+
[tool.poetry.group.dev.dependencies]
|
|
16
|
+
pytest = "^8.0"
|
|
17
|
+
pytest-cov = "^5.0"
|
|
18
|
+
|
|
19
|
+
[tool.poetry.scripts]
|
|
20
|
+
pagepull = "pagepull.cli:main"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["poetry-core"]
|
|
24
|
+
build-backend = "poetry.core.masonry.api"
|
|
25
|
+
|
|
26
|
+
[tool.pytest.ini_options]
|
|
27
|
+
testpaths = ["tests"]
|
|
28
|
+
|
|
29
|
+
[tool.coverage.run]
|
|
30
|
+
source = ["pagepull"]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""CLI entry point for pagepull."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from bs4 import BeautifulSoup
|
|
9
|
+
|
|
10
|
+
from pagepull import __version__
|
|
11
|
+
from pagepull.commands import div, select, strip, text
|
|
12
|
+
from pagepull.output import format_output
|
|
13
|
+
from pagepull.source import load_source
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser with all subcommands.

    Returns:
        Configured ArgumentParser exposing the div/select/strip/text
        subcommands plus the global --json/--quiet/--selector flags.
    """
    parser = argparse.ArgumentParser(
        prog="pagepull",
        description="Extract structured data from HTML pages.",
    )
    parser.add_argument("--version", action="version", version=f"pagepull {__version__}")
    parser.add_argument("--json", dest="as_json", action="store_true", help="output as JSON")
    parser.add_argument("--quiet", "-q", action="store_true", help="suppress headers and labels")
    parser.add_argument("--selector", "-s", metavar="CSS", help="scope to CSS selector before command")

    sub = parser.add_subparsers(dest="command", required=True)

    def _add_source(cmd: argparse.ArgumentParser) -> None:
        # Every subcommand accepts the same trailing optional input argument.
        cmd.add_argument("source", nargs="?", default=None, help="file, URL, or omit for stdin")

    p_div = sub.add_parser("div", help="extract a div by class or id")
    p_div.add_argument("name", help="class or id name to match")
    _add_source(p_div)
    p_div.add_argument("--by", choices=["class", "id"], default="class", help="match by class or id")

    p_sel = sub.add_parser("select", help="select elements with CSS selector")
    p_sel.add_argument("css", help="CSS selector")
    _add_source(p_sel)
    p_sel.add_argument("--text", dest="text_only", action="store_true", help="text content only")
    p_sel.add_argument("--attr", metavar="NAME", help="extract specific attribute")

    # NOTE(review): argparse assigns every positional to a preceding
    # nargs="+" argument before an optional positional, so
    # `pagepull strip script style page.html` treats page.html as a tag
    # name and reads stdin — confirm this is the intended UX.
    p_strip = sub.add_parser("strip", help="remove elements from HTML")
    p_strip.add_argument("elements", nargs="+", help="tag names to remove")
    _add_source(p_strip)

    p_text = sub.add_parser("text", help="extract visible text")
    _add_source(p_text)

    return parser
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def apply_selector(html: str, selector: str) -> str:
    """Reduce *html* to only the elements matching a CSS selector.

    Args:
        html: Full HTML document or fragment.
        selector: CSS selector to scope by.

    Returns:
        Matched elements serialized and joined by newlines, or "" when
        nothing matches.
    """
    document = BeautifulSoup(html, "html.parser")
    found = document.select(selector)
    if found:
        return "\n".join(str(node) for node in found)
    return ""
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse args, load HTML, run the command, print the result."""
    parser = build_parser()
    args = parser.parse_args(argv)

    # Every subcommand defines `source`; getattr preserves the original
    # defensive fallback to stdin if the attribute were ever absent.
    html = load_source(getattr(args, "source", None))

    if args.selector:
        html = apply_selector(html, args.selector)
        if not html:
            print("No elements matched --selector", file=sys.stderr)
            sys.exit(1)

    command = args.command
    if command == "div":
        extracted = div.run(html, args.name, by=args.by)
        if extracted is None:
            print(f"Error: div not found: {args.name}", file=sys.stderr)
            sys.exit(1)
        output = format_output(extracted, as_json=args.as_json, quiet=args.quiet, label=f"div.{args.name}")
    elif command == "select":
        output = format_output(
            select.run(html, args.css, text_only=args.text_only, attr=args.attr),
            as_json=args.as_json,
            quiet=args.quiet,
            label=f"select: {args.css}",
        )
    elif command == "strip":
        output = format_output(strip.run(html, args.elements), as_json=args.as_json, quiet=args.quiet)
    elif command == "text":
        output = format_output(text.run(html), as_json=args.as_json, quiet=args.quiet)
    else:
        # Unreachable while subparsers are required=True; kept as a safety net.
        parser.print_help()
        sys.exit(1)

    print(output)
|
|
File without changes
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Extract a div by class or id."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run(html: str, name: str, by: str = "class") -> str | None:
    """Return the inner HTML of the first <div> matched by class or id.

    Args:
        html: HTML content to search.
        name: Class name or id value to match.
        by: Match method — "class" or "id".

    Returns:
        Inner HTML of the matched div, or None when no div matches.
    """
    soup = BeautifulSoup(html, "html.parser")
    match = soup.find("div", id=name) if by == "id" else soup.find("div", class_=name)
    return None if match is None else match.decode_contents()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Select elements using CSS selectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run(
    html: str,
    selector: str,
    text_only: bool = False,
    attr: str | None = None,
) -> list[str]:
    """Select elements matching a CSS selector.

    Args:
        html: HTML content to search.
        selector: CSS selector string.
        text_only: If True, return stripped text content instead of HTML.
        attr: If set, return this attribute's value from each match.

    Returns:
        List of matched element strings, text content, or attribute values.
    """
    tags = BeautifulSoup(html, "html.parser").select(selector)

    if attr:
        # NOTE(review): bs4 returns a list for multi-valued attributes such
        # as class, so entries may not be plain strings in that case.
        return [tag.get(attr, "") for tag in tags]

    if text_only:
        return [tag.get_text(strip=True) for tag in tags]

    return [str(tag) for tag in tags]
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Remove specified HTML elements and output cleaned HTML."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run(html: str, elements: list[str]) -> str:
    """Remove every occurrence of the given tags and return the cleaned HTML.

    Args:
        html: HTML content to clean.
        elements: Tag names to remove (e.g., ["script", "style"]).

    Returns:
        Cleaned HTML string.
    """
    soup = BeautifulSoup(html, "html.parser")

    for tag_name in elements:
        for victim in soup.find_all(tag_name):
            victim.decompose()

    return str(soup)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Extract visible text from HTML."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def run(html: str) -> str:
    """Extract visible text content, stripping all HTML tags.

    Args:
        html: HTML content to extract text from.

    Returns:
        Visible text with each line stripped and blank lines removed.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Script and style bodies are never visible text.
    for hidden in soup(["script", "style"]):
        hidden.decompose()

    raw = soup.get_text(separator="\n")
    return "\n".join(
        stripped for line in raw.splitlines() if (stripped := line.strip())
    )
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Output formatting helpers for JSON, quiet, and text modes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def format_output(
|
|
9
|
+
data: str | list[str] | list[dict],
|
|
10
|
+
*,
|
|
11
|
+
as_json: bool = False,
|
|
12
|
+
quiet: bool = False,
|
|
13
|
+
label: str = "",
|
|
14
|
+
) -> str:
|
|
15
|
+
"""Format command output for display.
|
|
16
|
+
|
|
17
|
+
Args:
|
|
18
|
+
data: Raw output — string, list of strings, or list of dicts.
|
|
19
|
+
as_json: Output as JSON.
|
|
20
|
+
quiet: Suppress labels and headers.
|
|
21
|
+
label: Header label (ignored if quiet).
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
Formatted string ready for printing.
|
|
25
|
+
"""
|
|
26
|
+
if as_json:
|
|
27
|
+
if isinstance(data, str):
|
|
28
|
+
return json.dumps({"result": data})
|
|
29
|
+
return json.dumps(data, indent=2)
|
|
30
|
+
|
|
31
|
+
if isinstance(data, list):
|
|
32
|
+
lines = []
|
|
33
|
+
if label and not quiet:
|
|
34
|
+
lines.append(f"{label} ({len(data)} found)")
|
|
35
|
+
for item in data:
|
|
36
|
+
lines.append(str(item))
|
|
37
|
+
return "\n".join(lines)
|
|
38
|
+
|
|
39
|
+
if label and not quiet:
|
|
40
|
+
return f"{label}\n{data}"
|
|
41
|
+
return data
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Load HTML from file, URL, or stdin."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def load_source(source: str | None) -> str:
|
|
11
|
+
"""Load HTML content from a file path, URL, or stdin.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
source: File path, URL (http/https), or None for stdin.
|
|
15
|
+
|
|
16
|
+
Returns:
|
|
17
|
+
HTML content as string.
|
|
18
|
+
|
|
19
|
+
Raises:
|
|
20
|
+
FileNotFoundError: If file path doesn't exist.
|
|
21
|
+
SystemExit: If URL fetch fails or stdin is a TTY with no data.
|
|
22
|
+
"""
|
|
23
|
+
if source is None:
|
|
24
|
+
if sys.stdin.isatty():
|
|
25
|
+
print("Error: no input source. Provide a file, URL, or pipe HTML to stdin.", file=sys.stderr)
|
|
26
|
+
sys.exit(1)
|
|
27
|
+
return sys.stdin.read()
|
|
28
|
+
|
|
29
|
+
if source.startswith(("http://", "https://")):
|
|
30
|
+
try:
|
|
31
|
+
resp = requests.get(source, timeout=10)
|
|
32
|
+
resp.raise_for_status()
|
|
33
|
+
except requests.RequestException as e:
|
|
34
|
+
print(f"Error fetching URL: {e}", file=sys.stderr)
|
|
35
|
+
sys.exit(1)
|
|
36
|
+
return resp.text
|
|
37
|
+
|
|
38
|
+
try:
|
|
39
|
+
with open(source, encoding="utf-8") as f:
|
|
40
|
+
return f.read()
|
|
41
|
+
except FileNotFoundError:
|
|
42
|
+
print(f"Error: file not found: {source}", file=sys.stderr)
|
|
43
|
+
sys.exit(1)
|