scrape-cli 1.2.0.tar.gz → 1.2.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/PKG-INFO +67 -9
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/README.md +65 -2
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/pyproject.toml +2 -2
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli/__init__.py +1 -1
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli/scrape.py +10 -6
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/PKG-INFO +67 -9
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/SOURCES.txt +2 -2
- scrape_cli-1.2.2/tests/test_scrape.py +205 -0
- scrape_cli-1.2.0/setup.py +0 -37
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/dependency_links.txt +0 -0
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/entry_points.txt +0 -0
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/requires.txt +0 -0
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/top_level.txt +0 -0
- {scrape_cli-1.2.0 → scrape_cli-1.2.2}/setup.cfg +0 -0
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/PKG-INFO

@@ -1,24 +1,20 @@
 Metadata-Version: 2.4
 Name: scrape_cli
-Version: 1.2.0
+Version: 1.2.2
 Summary: It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.
-Home-page: https://github.com/aborruso/scrape-cli
-Author: Andrea Borruso
 Author-email: Andrea Borruso <aborruso@gmail.com>
 Project-URL: Homepage, https://github.com/aborruso/scrape-cli
 Classifier: Programming Language :: Python :: 3
 Classifier: Operating System :: OS Independent
-Requires-Python: >=3.6
+Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 Requires-Dist: cssselect
 Requires-Dist: lxml
 Requires-Dist: requests
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
 
 [](https://pypi.org/project/scrape-cli/)
 [](https://pypi.org/project/scrape-cli/)
+[](https://deepwiki.com/aborruso/scrape-cli)
 
 # scrape cli
 

The remaining hunks (@@ -52,7 +48,7 @@, @@ -80,7 +76,15 @@ and @@ -226,6 +230,60 @@) are identical to the README.md diff below, since PKG-INFO embeds the README as the long description.
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/README.md

@@ -1,5 +1,6 @@
 [](https://pypi.org/project/scrape-cli/)
 [](https://pypi.org/project/scrape-cli/)
+[](https://deepwiki.com/aborruso/scrape-cli)
 
 # scrape cli
 
@@ -33,7 +34,7 @@ uv tool install scrape-cli
 uv pip install scrape-cli
 
 # Or run temporarily without installing
-uvx scrape-cli --help
+uvx --from scrape-cli scrape --help
 ```
 
 ### Using pip
@@ -61,7 +62,15 @@ pip install -e .
 
 ### Using the Test HTML File
 
-In the `resources` directory you'll find a `test.html` file that you can use to test various scraping scenarios.
+In the `resources` directory you'll find a `test.html` file that you can use to test various scraping scenarios.
+
+**Note**: You can also test directly from the URL without cloning the repository:
+
+```bash
+scrape -e "h1" https://raw.githubusercontent.com/aborruso/scrape-cli/refs/heads/master/resources/test.html
+```
+
+Here are some examples:
 
 1. Extract all table data:
 
@@ -207,6 +216,60 @@ scrape -te 'h1, h2, h3' resources/test.html
 
 The `-t` option automatically excludes text from `<script>` and `<style>` tags and cleans up whitespace for better readability.
 
+### JSON Output Integration
+
+You can integrate scrape-cli with [xq](https://github.com/kislyuk/yq) (part of yq) to convert HTML output to structured JSON:
+
+```bash
+# Extract and convert to JSON (requires -b for complete HTML)
+scrape -be "a.external-link" resources/test.html | xq .
+```
+
+Output:
+
+```json
+{
+  "html": {
+    "body": {
+      "a": {
+        "@href": "https://example.com",
+        "@class": "external-link",
+        "#text": "Example Link"
+      }
+    }
+  }
+}
+```
+
+Table extraction example:
+
+```bash
+scrape -be "table.data-table td" resources/test.html | xq .
+```
+
+Output:
+
+```json
+{
+  "html": {
+    "body": {
+      "td": [
+        "1",
+        "John Doe",
+        "john@example.com",
+        "2",
+        "Jane Smith",
+        "jane@example.com"
+      ]
+    }
+  }
+}
+```
+
+**Note**: The `-b` flag is mandatory to produce valid HTML with `<html>`, `<head>` and `<body>` tags.
+
+Useful for JSON-based pipelines, APIs, databases, and processing with jq/DuckDB.
+
 Some notes on the commands:
 
 - `-e` to set the query
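A note on the `uvx` change above: the package is published as `scrape-cli`, but the command it installs is named `scrape` (see the `console_scripts` entry point in the deleted `setup.py` further down), so the old `uvx scrape-cli` invocation looks for an executable that does not exist. A minimal check of the two forms, assuming uv is installed:

```bash
# Old form: uvx infers the command name from the package name
# ("scrape-cli"), which does not match the installed executable.
uvx scrape-cli --help

# New form: --from names the package, then the real command follows.
uvx --from scrape-cli scrape --help
```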
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "scrape_cli"
-version = "1.2.0"
+version = "1.2.2"
 description = "It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector."
 readme = "README.md"
 authors = [
@@ -14,7 +14,7 @@ classifiers = [
     "Programming Language :: Python :: 3",
     "Operating System :: OS Independent",
 ]
-requires-python = ">=3.6"
+requires-python = ">=3.8"
 dependencies = [
     "cssselect",
     "lxml",
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli/scrape.py

@@ -63,13 +63,13 @@ def is_xpath(expression):
     - Expressions wrapped in parentheses that contain XPath syntax
     """
     expr = expression.strip()
-
+
     # Direct XPath patterns
     if expr.startswith('/') or expr.startswith('//'):
         return True
     if '::' in expr:
         return True
-
+
     # Handle expressions wrapped in parentheses
     if expr.startswith('(') and expr.endswith(')'):
         # Remove outer parentheses and check inner content
@@ -78,7 +78,7 @@ def is_xpath(expression):
             return True
         if '::' in inner_expr:
             return True
-
+
     # Additional XPath indicators
     # Check for XPath-specific patterns that CSS doesn't have
     if '//' in expr or expr.startswith('/'):
@@ -91,7 +91,7 @@ def is_xpath(expression):
         return True
     if re.search(r'\b(ancestor|descendant|following|preceding|parent|child)::', expr):  # XPath axes
         return True
-
+
     return False
 
 def main():
@@ -128,6 +128,8 @@ def main():
     parser.add_argument('-r', '--rawinput', action='store_true', default=False,
                         help="Do not parse HTML before passing to etree (useful for CData)")
     parser.add_argument('--check-existence', dest='check_existence', action='store_true')
+    parser.add_argument('-u', '--user-agent', default=None,
+                        help="Custom User-Agent string for HTTP requests")
     args = parser.parse_args()
 
     # Check that at least one expression is provided by the user (unless using -t option)
@@ -142,7 +144,9 @@ def main():
     if args.html.startswith('http://') or args.html.startswith('https://'):
         # If the input is a URL, download the HTML content
         try:
-            response = requests.get(args.html, timeout=30)
+            ua = args.user_agent or "Mozilla/5.0 (compatible; scrape-cli/1.0)"
+            headers = {"User-Agent": ua}
+            response = requests.get(args.html, headers=headers, timeout=30)
             response.raise_for_status()
             inp = response.content
         except requests.RequestException as e:
@@ -189,7 +193,7 @@ def main():
             meta = re.search(r'<meta[^>]+charset=["\']?([\w-]+)', head)
             if meta:
                 return meta.group(1)
-        except:
+        except Exception:
             pass
         return None
 
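The new `-u/--user-agent` option applies only to URL inputs; as the hunk above shows, omitting it falls back to `Mozilla/5.0 (compatible; scrape-cli/1.0)`. A quick sketch of the flag in use (the target URL and UA string are just placeholders):

```bash
# Fetch over HTTP with a custom User-Agent header (new in 1.2.x);
# without -u, the default "Mozilla/5.0 (compatible; scrape-cli/1.0)" is sent.
scrape -e "//h1" -u "MyProject/1.0 (+https://example.org/bot)" https://example.com
```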
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/PKG-INFO

Identical to the PKG-INFO diff at the top of this page: the egg-info copy is regenerated from the same metadata at build time.
{scrape_cli-1.2.0 → scrape_cli-1.2.2}/scrape_cli.egg-info/SOURCES.txt

@@ -1,6 +1,5 @@
 README.md
 pyproject.toml
-setup.py
 scrape_cli/__init__.py
 scrape_cli/scrape.py
 scrape_cli.egg-info/PKG-INFO
@@ -8,4 +7,5 @@ scrape_cli.egg-info/SOURCES.txt
 scrape_cli.egg-info/dependency_links.txt
 scrape_cli.egg-info/entry_points.txt
 scrape_cli.egg-info/requires.txt
-scrape_cli.egg-info/top_level.txt
+scrape_cli.egg-info/top_level.txt
+tests/test_scrape.py
scrape_cli-1.2.2/tests/test_scrape.py ADDED

@@ -0,0 +1,205 @@
+import subprocess
+import sys
+import threading
+from pathlib import Path
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+ROOT = Path(__file__).resolve().parents[1]
+TEST_HTML = ROOT / "resources" / "test.html"
+sys.path.insert(0, str(ROOT))
+
+from scrape_cli.scrape import is_xpath
+
+
+def run_scrape(*args, input_data=None):
+    cmd = [sys.executable, "-m", "scrape_cli.scrape", *args]
+    return subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        cwd=ROOT,
+        input=input_data,
+    )
+
+
+def run_test_server(html_bytes):
+    class Handler(BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.send_response(200)
+            self.send_header("Content-Type", "text/html; charset=utf-8")
+            self.end_headers()
+            self.wfile.write(html_bytes)
+
+        def log_message(self, format, *args):
+            return
+
+    server = HTTPServer(("127.0.0.1", 0), Handler)
+    thread = threading.Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    return server, thread
+
+
+def test_is_xpath_true_patterns():
+    candidates = [
+        "//div",
+        "/html/body/div",
+        "(//div)[1]",
+        "//a/@href",
+        "//li[2]",
+        "ancestor::div",
+        "descendant::span",
+        "//p/text()",
+    ]
+
+    for expression in candidates:
+        assert is_xpath(expression) is True
+
+
+def test_is_xpath_false_css_patterns():
+    candidates = [
+        "div.content > a.link",
+        "a[href*='/about']",
+        "input[type='email']",
+        "ul.items-list li:first-child",
+        "div.class1.class2",
+    ]
+
+    for expression in candidates:
+        assert is_xpath(expression) is False
+
+
+def test_xpath_parentheses_extracts_first_match():
+    result = run_scrape(str(TEST_HTML), "-e", "(//ul[@class='items-list']/li)[1]", "-t")
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "First item"
+
+
+def test_css_attribute_selector_is_not_misclassified_as_xpath():
+    result = run_scrape(str(TEST_HTML), "-e", ".resource-links a[href*='github.com']", "-t")
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "GitHub Repository"
+
+
+def test_check_existence_true_and_false():
+    found = run_scrape(str(TEST_HTML), "-e", "//h1", "--check-existence")
+    missing = run_scrape(str(TEST_HTML), "-e", "//this-node-does-not-exist", "--check-existence")
+
+    assert found.returncode == 0
+    assert missing.returncode == 1
+
+
+def test_encoding_meta_charset_iso_8859_1(tmp_path):
+    html = """<!doctype html>
+<html>
+<head><meta charset=\"iso-8859-1\"></head>
+<body><p>Perch\xe9</p></body>
+</html>
+""".encode("iso-8859-1")
+    sample = tmp_path / "latin1.html"
+    sample.write_bytes(html)
+
+    result = run_scrape(str(sample), "-e", "//p/text()", "-t")
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "Perché"
+
+
+def test_argument_extracts_attribute_value():
+    result = run_scrape(str(TEST_HTML), "-e", "//a[@class='external-link']", "-a", "href")
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "https://example.com"
+
+
+def test_body_flag_wraps_output_in_html_body():
+    result = run_scrape(str(TEST_HTML), "-e", "//h1", "-b")
+
+    assert result.returncode == 0
+    assert result.stdout.startswith("<!DOCTYPE html>\n<html>\n<body>\n")
+    assert result.stdout.strip().endswith("</body>\n</html>")
+    assert "<h1 id=\"main-title\">Welcome to the Test Page</h1>" in result.stdout
+
+
+def test_text_flag_without_expression_extracts_body_and_skips_script():
+    result = run_scrape(str(TEST_HTML), "-t")
+
+    assert result.returncode == 0
+    assert "Welcome to the Test Page" in result.stdout
+    assert "document.getElementById('dynamic-content')" not in result.stdout
+
+
+def test_short_check_existence_flag_x():
+    found = run_scrape(str(TEST_HTML), "-e", "//table", "-x")
+    missing = run_scrape(str(TEST_HTML), "-e", "//definitely-not-here", "-x")
+
+    assert found.returncode == 0
+    assert missing.returncode == 1
+
+
+def test_rawinput_parses_xml_without_html_parser():
+    xml_data = "<root><item>one</item><item>two</item></root>"
+    result = run_scrape("-e", "//item[2]/text()", "-r", input_data=xml_data)
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "two"
+
+
+def test_stdin_input_works_when_no_html_argument():
+    html_data = "<html><body><p>stdin-ok</p></body></html>"
+    result = run_scrape("-e", "//p/text()", "-t", input_data=html_data)
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "stdin-ok"
+
+
+def test_empty_stdin_returns_error():
+    result = run_scrape("-e", "//p", input_data="")
+
+    assert result.returncode == 1
+    assert "Error: No input received from stdin" in result.stdout
+
+
+def test_missing_file_returns_error():
+    result = run_scrape("resources/this-file-does-not-exist.html", "-e", "//p")
+
+    assert result.returncode == 1
+    assert "was not found" in result.stdout
+
+
+def test_missing_expression_without_text_returns_error():
+    result = run_scrape(str(TEST_HTML))
+
+    assert result.returncode == 1
+    assert "you must provide at least one XPath query or CSS3 selector" in result.stderr
+
+
+def test_incorrect_eb_order_exits_with_specific_message():
+    result = run_scrape("-eb")
+
+    assert result.returncode == 1
+    assert "Please use -be instead of -eb." in result.stderr
+
+
+def test_invalid_css_selector_fails_conversion():
+    result = run_scrape(str(TEST_HTML), "-e", "div[")
+
+    assert result.returncode == 1
+    assert "Error converting CSS selector to XPath" in result.stdout
+
+
+def test_url_input_downloads_and_extracts_text():
+    html_bytes = TEST_HTML.read_bytes()
+    server, thread = run_test_server(html_bytes)
+
+    try:
+        url = f"http://127.0.0.1:{server.server_address[1]}"
+        result = run_scrape(url, "-e", "//h1/text()", "-t")
+    finally:
+        server.shutdown()
+        server.server_close()
+        thread.join(timeout=2)
+
+    assert result.returncode == 0
+    assert result.stdout.strip() == "Welcome to the Test Page"
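The suite relies on the pytest `tmp_path` fixture and bare `assert` tests, so pytest is the expected runner. From a repository checkout, something like:

```bash
# Run the new end-to-end suite against the in-repo resources/test.html
# (assumes pytest is installed in the environment).
python -m pytest tests/test_scrape.py -v
```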
scrape_cli-1.2.0/setup.py DELETED

@@ -1,37 +0,0 @@
-# setup.py
-from setuptools import setup
-from pathlib import Path
-
-# Read the README
-this_directory = Path(__file__).parent
-long_description = (this_directory / "README.md").read_text(encoding="utf-8")
-
-setup(
-    name="scrape_cli",
-    version="1.1.9",
-    description="It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    author="Andrea Borruso",
-    author_email="aborruso@gmail.com",
-    url="https://github.com/aborruso/scrape-cli",
-    license="MIT",
-    packages=["scrape_cli"],
-    package_dir={"scrape_cli": "scrape_cli"},
-    entry_points={
-        'console_scripts': [
-            'scrape=scrape_cli.scrape:main',
-        ],
-    },
-    install_requires=[
-        "cssselect",
-        "lxml",
-        "requests"
-    ],
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires='>=3.6',
-)
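With `setup.py` removed, packaging is driven entirely by `pyproject.toml` and the `setuptools.build_meta` backend it declares. A sketch of producing the sdist and wheel under that setup (assumes the `build` package is available):

```bash
# PEP 517 build via the backend declared in pyproject.toml.
python -m pip install build
python -m build
```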
The remaining files (scrape_cli.egg-info/dependency_links.txt, entry_points.txt, requires.txt, top_level.txt and setup.cfg) are unchanged.