pagepull 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,205 @@
1
+ Metadata-Version: 2.4
2
+ Name: pagepull
3
+ Version: 0.1.0
4
+ Summary: Extract and transform HTML page content with composable CLI tools
5
+ License: MIT
6
+ Author: Neil Johnson
7
+ Author-email: neil@cadent.com
8
+ Requires-Python: >=3.11,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Requires-Dist: beautifulsoup4 (>=4.12,<5.0)
16
+ Requires-Dist: requests (>=2.31,<3.0)
17
+ Description-Content-Type: text/markdown
18
+
19
+ # pagepull
20
+
21
+ Extract structured data from HTML pages via the command line.
22
+
23
+ pagepull wraps [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) behind a simple CLI, turning common DOM extraction tasks into one-liners. Think of it as `jq` for HTML.
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install pagepull
29
+ ```
30
+
31
+ Or with pipx for isolated install:
32
+
33
+ ```bash
34
+ pipx install pagepull
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ # Extract the content div from a WordPress page
41
+ pagepull div entry-content https://example.com/about
42
+
43
+ # Same thing, as markdown
44
+ pagepull div entry-content --markdown https://example.com/about
45
+
46
+ # List images and check for missing alt text
47
+ pagepull images --alt https://example.com/about
48
+
49
+ # Pull meta tags
50
+ pagepull meta --title --description https://example.com/about
51
+
52
+ # Use any CSS selector
53
+ pagepull select "nav.primary a" page.html
54
+ ```
55
+
56
+ ## Input
57
+
58
+ pagepull accepts three input types:
59
+
60
+ ```bash
61
+ # Local file
62
+ pagepull div content page.html
63
+
64
+ # URL (fetched automatically)
65
+ pagepull div content https://example.com/page
66
+
67
+ # stdin
68
+ curl -s https://example.com | pagepull div content
69
+ ```
70
+
71
+ ## Commands
72
+
73
+ ### `div` — Extract a div by class or id
74
+
75
+ ```bash
76
+ pagepull div entry-content page.html
77
+ pagepull div sidebar --by id page.html
78
+ pagepull div entry-content --strip script,style --markdown page.html
79
+ ```
80
+
81
+ ### `images` — List images with metadata
82
+
83
+ ```bash
84
+ pagepull images page.html
85
+ pagepull images --alt --dimensions page.html
86
+ pagepull images --json page.html
87
+ ```
88
+
89
+ Use `--alt` to show alt text (missing alt is flagged as `[MISSING]`) and `--dimensions` to include width/height.
90
+
91
+ ### `meta` — Extract meta tags
92
+
93
+ ```bash
94
+ pagepull meta page.html # all meta tags
95
+ pagepull meta --title --description page.html # specific tags
96
+ pagepull meta --og page.html # Open Graph tags
97
+ ```
98
+
99
+ ### `links` — List all links
100
+
101
+ ```bash
102
+ pagepull links page.html
103
+ pagepull links --external-only page.html
104
+ pagepull links --csv page.html
105
+ ```
106
+
107
+ ### `headings` — Heading hierarchy
108
+
109
+ ```bash
110
+ pagepull headings page.html
111
+ ```
112
+
113
+ ```
114
+ h1: Welcome to Our Site
115
+ h2: About Us
116
+ h2: Services
117
+ h3: Web Design
118
+ ```
119
+
120
+ ### `text` — Visible text only
121
+
122
+ ```bash
123
+ pagepull text page.html
124
+ pagepull text --selector "div.entry-content" page.html
125
+ ```
126
+
127
+ ### `select` — Raw CSS selector
128
+
129
+ ```bash
130
+ pagepull select "nav a" page.html
131
+ pagepull select "img[alt='']" --json page.html
132
+ pagepull select "h2 + p" --text page.html
133
+ ```
134
+
135
+ ### `strip` — Remove elements
136
+
137
+ ```bash
138
+ pagepull strip script noscript style page.html
139
+ ```
140
+
141
+ ### `table` — Extract HTML tables
142
+
143
+ ```bash
144
+ pagepull table --csv page.html
145
+ pagepull table --index 0 --json page.html
146
+ ```
147
+
148
+ ## Global Flags
149
+
150
+ | Flag | Description |
151
+ |------|-------------|
152
+ | `--selector <css>` | Scope any command to a CSS selector first |
153
+ | `--json` | Structured JSON output |
154
+ | `--csv` | CSV output (where applicable) |
155
+ | `--markdown` | Convert HTML to markdown |
156
+ | `--quiet` | Suppress headers and labels |
157
+
158
+ ## Scoping with `--selector`
159
+
160
+ Any command can be scoped to a portion of the page:
161
+
162
+ ```bash
163
+ # Images only within the article
164
+ pagepull images --alt --selector "article" page.html
165
+
166
+ # Links only in the footer
167
+ pagepull links --selector "footer" page.html
168
+
169
+ # Text from a specific section
170
+ pagepull text --selector "div.entry-content" page.html
171
+ ```
172
+
173
+ ## Pairing with sitewalker
174
+
175
+ pagepull handles one page. [sitewalker](https://github.com/cadentdev/sitewalker) crawls sites. Together they cover site-wide extraction:
176
+
177
+ ```bash
178
+ # Audit alt text across an entire site
179
+ sitewalker -p https://example.com | xargs -I{} pagepull images --alt --json {}
180
+
181
+ # Extract every page title
182
+ sitewalker -p https://example.com | xargs -I{} pagepull meta --title {}
183
+
184
+ # Pull article content as markdown
185
+ sitewalker -p https://example.com | xargs -I{} pagepull div content --markdown {}
186
+ ```
187
+
188
+ ## Development
189
+
190
+ ```bash
191
+ git clone git@github.com:cadentdev/pagepull.git
192
+ cd pagepull
193
+ poetry install
194
+ poetry run pytest
195
+ ```
196
+
197
+ ## Requirements
198
+
199
+ - Python 3.11+
200
+ - Dependencies: beautifulsoup4, requests
201
+
202
+ ## License
203
+
204
+ MIT
205
+
@@ -0,0 +1,186 @@
1
+ # pagepull
2
+
3
+ Extract structured data from HTML pages via the command line.
4
+
5
+ pagepull wraps [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) behind a simple CLI, turning common DOM extraction tasks into one-liners. Think of it as `jq` for HTML.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install pagepull
11
+ ```
12
+
13
+ Or with pipx for isolated install:
14
+
15
+ ```bash
16
+ pipx install pagepull
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```bash
22
+ # Extract the content div from a WordPress page
23
+ pagepull div entry-content https://example.com/about
24
+
25
+ # Same thing, as markdown
26
+ pagepull div entry-content --markdown https://example.com/about
27
+
28
+ # List images and check for missing alt text
29
+ pagepull images --alt https://example.com/about
30
+
31
+ # Pull meta tags
32
+ pagepull meta --title --description https://example.com/about
33
+
34
+ # Use any CSS selector
35
+ pagepull select "nav.primary a" page.html
36
+ ```
37
+
38
+ ## Input
39
+
40
+ pagepull accepts three input types:
41
+
42
+ ```bash
43
+ # Local file
44
+ pagepull div content page.html
45
+
46
+ # URL (fetched automatically)
47
+ pagepull div content https://example.com/page
48
+
49
+ # stdin
50
+ curl -s https://example.com | pagepull div content
51
+ ```
52
+
53
+ ## Commands
54
+
55
+ ### `div` — Extract a div by class or id
56
+
57
+ ```bash
58
+ pagepull div entry-content page.html
59
+ pagepull div sidebar --by id page.html
60
+ pagepull div entry-content --strip script,style --markdown page.html
61
+ ```
62
+
63
+ ### `images` — List images with metadata
64
+
65
+ ```bash
66
+ pagepull images page.html
67
+ pagepull images --alt --dimensions page.html
68
+ pagepull images --json page.html
69
+ ```
70
+
71
+ Use `--alt` to show alt text (missing alt is flagged as `[MISSING]`) and `--dimensions` to include width/height.
72
+
73
+ ### `meta` — Extract meta tags
74
+
75
+ ```bash
76
+ pagepull meta page.html # all meta tags
77
+ pagepull meta --title --description page.html # specific tags
78
+ pagepull meta --og page.html # Open Graph tags
79
+ ```
80
+
81
+ ### `links` — List all links
82
+
83
+ ```bash
84
+ pagepull links page.html
85
+ pagepull links --external-only page.html
86
+ pagepull links --csv page.html
87
+ ```
88
+
89
+ ### `headings` — Heading hierarchy
90
+
91
+ ```bash
92
+ pagepull headings page.html
93
+ ```
94
+
95
+ ```
96
+ h1: Welcome to Our Site
97
+ h2: About Us
98
+ h2: Services
99
+ h3: Web Design
100
+ ```
101
+
102
+ ### `text` — Visible text only
103
+
104
+ ```bash
105
+ pagepull text page.html
106
+ pagepull text --selector "div.entry-content" page.html
107
+ ```
108
+
109
+ ### `select` — Raw CSS selector
110
+
111
+ ```bash
112
+ pagepull select "nav a" page.html
113
+ pagepull select "img[alt='']" --json page.html
114
+ pagepull select "h2 + p" --text page.html
115
+ ```
116
+
117
+ ### `strip` — Remove elements
118
+
119
+ ```bash
120
+ pagepull strip script noscript style page.html
121
+ ```
122
+
123
+ ### `table` — Extract HTML tables
124
+
125
+ ```bash
126
+ pagepull table --csv page.html
127
+ pagepull table --index 0 --json page.html
128
+ ```
129
+
130
+ ## Global Flags
131
+
132
+ | Flag | Description |
133
+ |------|-------------|
134
+ | `--selector <css>` | Scope any command to a CSS selector first |
135
+ | `--json` | Structured JSON output |
136
+ | `--csv` | CSV output (where applicable) |
137
+ | `--markdown` | Convert HTML to markdown |
138
+ | `--quiet` | Suppress headers and labels |
139
+
140
+ ## Scoping with `--selector`
141
+
142
+ Any command can be scoped to a portion of the page:
143
+
144
+ ```bash
145
+ # Images only within the article
146
+ pagepull images --alt --selector "article" page.html
147
+
148
+ # Links only in the footer
149
+ pagepull links --selector "footer" page.html
150
+
151
+ # Text from a specific section
152
+ pagepull text --selector "div.entry-content" page.html
153
+ ```
154
+
155
+ ## Pairing with sitewalker
156
+
157
+ pagepull handles one page. [sitewalker](https://github.com/cadentdev/sitewalker) crawls sites. Together they cover site-wide extraction:
158
+
159
+ ```bash
160
+ # Audit alt text across an entire site
161
+ sitewalker -p https://example.com | xargs -I{} pagepull images --alt --json {}
162
+
163
+ # Extract every page title
164
+ sitewalker -p https://example.com | xargs -I{} pagepull meta --title {}
165
+
166
+ # Pull article content as markdown
167
+ sitewalker -p https://example.com | xargs -I{} pagepull div content --markdown {}
168
+ ```
169
+
170
+ ## Development
171
+
172
+ ```bash
173
+ git clone git@github.com:cadentdev/pagepull.git
174
+ cd pagepull
175
+ poetry install
176
+ poetry run pytest
177
+ ```
178
+
179
+ ## Requirements
180
+
181
+ - Python 3.11+
182
+ - Dependencies: beautifulsoup4, requests
183
+
184
+ ## License
185
+
186
+ MIT
@@ -0,0 +1,30 @@
1
+ [tool.poetry]
2
+ name = "pagepull"
3
+ version = "0.1.0"
4
+ description = "Extract and transform HTML page content with composable CLI tools"
5
+ authors = ["Neil Johnson <neil@cadent.com>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ packages = [{include = "pagepull", from = "src"}]
9
+
10
+ [tool.poetry.dependencies]
11
+ python = "^3.11"
12
+ beautifulsoup4 = "^4.12"
13
+ requests = "^2.31"
14
+
15
+ [tool.poetry.group.dev.dependencies]
16
+ pytest = "^8.0"
17
+ pytest-cov = "^5.0"
18
+
19
+ [tool.poetry.scripts]
20
+ pagepull = "pagepull.cli:main"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry.core.masonry.api"
25
+
26
+ [tool.pytest.ini_options]
27
+ testpaths = ["tests"]
28
+
29
+ [tool.coverage.run]
30
+ source = ["pagepull"]
@@ -0,0 +1,3 @@
1
+ """pagepull — Extract structured data from HTML pages via the CLI."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,99 @@
1
+ """CLI entry point for pagepull."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from bs4 import BeautifulSoup
9
+
10
+ from pagepull import __version__
11
+ from pagepull.commands import div, select, strip, text
12
+ from pagepull.output import format_output
13
+ from pagepull.source import load_source
14
+
15
+
16
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser with all subcommands.

    Returns:
        Configured ArgumentParser exposing div/select/strip/text subcommands
        plus the global --json/--quiet/--selector flags.
    """
    parser = argparse.ArgumentParser(
        prog="pagepull",
        description="Extract structured data from HTML pages.",
    )
    parser.add_argument("--version", action="version", version=f"pagepull {__version__}")
    parser.add_argument("--json", dest="as_json", action="store_true", help="output as JSON")
    parser.add_argument("--quiet", "-q", action="store_true", help="suppress headers and labels")
    parser.add_argument("--selector", "-s", metavar="CSS", help="scope to CSS selector before command")

    subcommands = parser.add_subparsers(dest="command", required=True)

    def add_source(sub_parser: argparse.ArgumentParser) -> None:
        # Every subcommand takes the same optional trailing input argument.
        sub_parser.add_argument("source", nargs="?", default=None, help="file, URL, or omit for stdin")

    # div
    div_parser = subcommands.add_parser("div", help="extract a div by class or id")
    div_parser.add_argument("name", help="class or id name to match")
    add_source(div_parser)
    div_parser.add_argument("--by", choices=["class", "id"], default="class", help="match by class or id")

    # select
    select_parser = subcommands.add_parser("select", help="select elements with CSS selector")
    select_parser.add_argument("css", help="CSS selector")
    add_source(select_parser)
    select_parser.add_argument("--text", dest="text_only", action="store_true", help="text content only")
    select_parser.add_argument("--attr", metavar="NAME", help="extract specific attribute")

    # strip
    strip_parser = subcommands.add_parser("strip", help="remove elements from HTML")
    # NOTE(review): a greedy nargs="+" positional followed by an optional
    # positional means argparse may fold a trailing file argument into
    # `elements` (e.g. `pagepull strip script style page.html`) — confirm
    # whether this subcommand is intended to be fed via stdin only.
    strip_parser.add_argument("elements", nargs="+", help="tag names to remove")
    add_source(strip_parser)

    # text
    text_parser = subcommands.add_parser("text", help="extract visible text")
    add_source(text_parser)

    return parser
52
+
53
+
54
def apply_selector(html: str, selector: str) -> str:
    """Return the concatenated HTML of all elements matching *selector*.

    An empty string signals that nothing matched.
    """
    document = BeautifulSoup(html, "html.parser")
    selected = document.select(selector)
    return "\n".join(str(element) for element in selected) if selected else ""
61
+
62
+
63
def main(argv: list[str] | None = None) -> None:
    """Main CLI entry point.

    Parses arguments, loads the HTML source, optionally scopes it with
    --selector, dispatches to the requested subcommand, and prints the
    formatted result.

    Args:
        argv: Argument list for testing; defaults to sys.argv[1:].

    Raises:
        SystemExit: With status 1 on any user-facing error (no selector
            match, div not found, or unknown command).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Every subcommand defines an optional "source" positional; getattr keeps
    # this safe if a future subcommand omits it.
    html = load_source(getattr(args, "source", None))

    if args.selector:
        html = apply_selector(html, args.selector)
        if not html:
            print("No elements matched --selector", file=sys.stderr)
            sys.exit(1)

    if args.command == "div":
        result = div.run(html, args.name, by=args.by)
        if result is None:
            print(f"Error: div not found: {args.name}", file=sys.stderr)
            sys.exit(1)
        output = format_output(result, as_json=args.as_json, quiet=args.quiet, label=f"div.{args.name}")

    elif args.command == "select":
        result = select.run(html, args.css, text_only=args.text_only, attr=args.attr)
        output = format_output(result, as_json=args.as_json, quiet=args.quiet, label=f"select: {args.css}")

    elif args.command == "strip":
        result = strip.run(html, args.elements)
        output = format_output(result, as_json=args.as_json, quiet=args.quiet)

    elif args.command == "text":
        result = text.run(html)
        output = format_output(result, as_json=args.as_json, quiet=args.quiet)

    else:
        # Unreachable while subparsers are required=True; kept as a guard.
        parser.print_help()
        sys.exit(1)

    print(output)
File without changes
@@ -0,0 +1,29 @@
1
+ """Extract a div by class or id."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
def run(html: str, name: str, by: str = "class") -> str | None:
    """Extract inner HTML of a div matched by class or id.

    Args:
        html: HTML content to search.
        name: Class name or id to match.
        by: Match method — "class" or "id".

    Returns:
        Inner HTML of matched div, or None if not found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # BeautifulSoup spells the keyword as "class_" to avoid the reserved word.
    criteria = {"id": name} if by == "id" else {"class_": name}
    match = soup.find("div", **criteria)

    return None if match is None else match.decode_contents()
@@ -0,0 +1,34 @@
1
+ """Select elements using CSS selectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
def run(
    html: str,
    selector: str,
    text_only: bool = False,
    attr: str | None = None,
) -> list[str]:
    """Select elements matching a CSS selector.

    Args:
        html: HTML content to search.
        selector: CSS selector string.
        text_only: If True, return text content instead of HTML.
        attr: If set, return this attribute's value from each match.

    Returns:
        List of matched element strings, text content, or attribute values.
    """
    soup = BeautifulSoup(html, "html.parser")
    matches = soup.select(selector)

    if attr:
        # BeautifulSoup returns a list for multi-valued attributes such as
        # "class" and "rel"; join those so the result is always list[str].
        values: list[str] = []
        for tag in matches:
            value = tag.get(attr, "")
            values.append(" ".join(value) if isinstance(value, list) else value)
        return values

    if text_only:
        return [tag.get_text(strip=True) for tag in matches]

    return [str(tag) for tag in matches]
@@ -0,0 +1,24 @@
1
+ """Remove specified HTML elements and output cleaned HTML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
def run(html: str, elements: list[str]) -> str:
    """Remove all instances of specified elements from HTML.

    Args:
        html: HTML content to clean.
        elements: Tag names to remove (e.g., ["script", "style"]).

    Returns:
        Cleaned HTML string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # find_all accepts a list of tag names, so one pass covers all of them;
    # it returns a materialized list, making in-loop decompose() safe.
    for node in soup.find_all(elements):
        node.decompose()

    return str(soup)
@@ -0,0 +1,27 @@
1
+ """Extract visible text from HTML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
def run(html: str) -> str:
    """Extract visible text content, stripping all HTML tags.

    Args:
        html: HTML content to extract text from.

    Returns:
        Visible text with whitespace normalized.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Script and style bodies are not visible text; drop them first.
    for hidden in soup(["script", "style"]):
        hidden.decompose()

    raw = soup.get_text(separator="\n")

    # Keep only non-blank lines, each stripped of surrounding whitespace.
    stripped = (line.strip() for line in raw.splitlines())
    return "\n".join(line for line in stripped if line)
@@ -0,0 +1,41 @@
1
+ """Output formatting helpers for JSON, quiet, and text modes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+
8
+ def format_output(
9
+ data: str | list[str] | list[dict],
10
+ *,
11
+ as_json: bool = False,
12
+ quiet: bool = False,
13
+ label: str = "",
14
+ ) -> str:
15
+ """Format command output for display.
16
+
17
+ Args:
18
+ data: Raw output — string, list of strings, or list of dicts.
19
+ as_json: Output as JSON.
20
+ quiet: Suppress labels and headers.
21
+ label: Header label (ignored if quiet).
22
+
23
+ Returns:
24
+ Formatted string ready for printing.
25
+ """
26
+ if as_json:
27
+ if isinstance(data, str):
28
+ return json.dumps({"result": data})
29
+ return json.dumps(data, indent=2)
30
+
31
+ if isinstance(data, list):
32
+ lines = []
33
+ if label and not quiet:
34
+ lines.append(f"{label} ({len(data)} found)")
35
+ for item in data:
36
+ lines.append(str(item))
37
+ return "\n".join(lines)
38
+
39
+ if label and not quiet:
40
+ return f"{label}\n{data}"
41
+ return data
@@ -0,0 +1,43 @@
1
+ """Load HTML from file, URL, or stdin."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ import requests
8
+
9
+
10
+ def load_source(source: str | None) -> str:
11
+ """Load HTML content from a file path, URL, or stdin.
12
+
13
+ Args:
14
+ source: File path, URL (http/https), or None for stdin.
15
+
16
+ Returns:
17
+ HTML content as string.
18
+
19
+ Raises:
20
+ FileNotFoundError: If file path doesn't exist.
21
+ SystemExit: If URL fetch fails or stdin is a TTY with no data.
22
+ """
23
+ if source is None:
24
+ if sys.stdin.isatty():
25
+ print("Error: no input source. Provide a file, URL, or pipe HTML to stdin.", file=sys.stderr)
26
+ sys.exit(1)
27
+ return sys.stdin.read()
28
+
29
+ if source.startswith(("http://", "https://")):
30
+ try:
31
+ resp = requests.get(source, timeout=10)
32
+ resp.raise_for_status()
33
+ except requests.RequestException as e:
34
+ print(f"Error fetching URL: {e}", file=sys.stderr)
35
+ sys.exit(1)
36
+ return resp.text
37
+
38
+ try:
39
+ with open(source, encoding="utf-8") as f:
40
+ return f.read()
41
+ except FileNotFoundError:
42
+ print(f"Error: file not found: {source}", file=sys.stderr)
43
+ sys.exit(1)