scrape-cli 1.2.2.tar.gz → 1.2.3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/PKG-INFO +1 -1
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/pyproject.toml +1 -1
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli/__init__.py +1 -1
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli/scrape.py +25 -11
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/PKG-INFO +1 -1
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/tests/test_scrape.py +4 -4
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/README.md +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/SOURCES.txt +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/dependency_links.txt +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/entry_points.txt +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/requires.txt +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/scrape_cli.egg-info/top_level.txt +0 -0
- {scrape_cli-1.2.2 → scrape_cli-1.2.3}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrape_cli
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.3
|
|
4
4
|
Summary: It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.
|
|
5
5
|
Author-email: Andrea Borruso <aborruso@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/aborruso/scrape-cli
|
|
@@ -49,7 +49,7 @@ def convert_css_to_xpath(expression):
|
|
|
49
49
|
try:
|
|
50
50
|
return GenericTranslator().css_to_xpath(expression)
|
|
51
51
|
except Exception as e:
|
|
52
|
-
print(f"Error converting CSS selector to XPath: {e}")
|
|
52
|
+
print(f"Error converting CSS selector to XPath: {e}", file=sys.stderr)
|
|
53
53
|
sys.exit(1)
|
|
54
54
|
|
|
55
55
|
def is_xpath(expression):
|
|
@@ -98,7 +98,18 @@ def main():
|
|
|
98
98
|
# Command line argument parser definition
|
|
99
99
|
parser = argparse.ArgumentParser(
|
|
100
100
|
description='Extract HTML elements using an XPath query or CSS3 selector.',
|
|
101
|
-
|
|
101
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
102
|
+
epilog='''\
|
|
103
|
+
examples:
|
|
104
|
+
scrape -e "//h1" file.html XPath expression
|
|
105
|
+
scrape -e "h1.title" file.html CSS selector
|
|
106
|
+
scrape -e "//a" -a href file.html extract attribute
|
|
107
|
+
scrape -t file.html extract all text
|
|
108
|
+
scrape -e "//h1" https://example.com fetch from URL
|
|
109
|
+
cat file.html | scrape -e "//h1" read from stdin
|
|
110
|
+
scrape -e "//h1" -x file.html check existence (exit 0/1)
|
|
111
|
+
scrape -be "//article" file.html wrap output in <html><body>
|
|
112
|
+
'''
|
|
102
113
|
)
|
|
103
114
|
|
|
104
115
|
parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
|
|
@@ -134,10 +145,13 @@ def main():
|
|
|
134
145
|
|
|
135
146
|
# Check that at least one expression is provided by the user (unless using -t option)
|
|
136
147
|
if not args.expression and not args.text:
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
"
|
|
148
|
+
print(
|
|
149
|
+
"Error: no expression specified. Use -e \"//selector\" or -t.\n"
|
|
150
|
+
" scrape -e \"//h1\" file.html\n"
|
|
151
|
+
" scrape -t file.html",
|
|
152
|
+
file=sys.stderr
|
|
140
153
|
)
|
|
154
|
+
sys.exit(1)
|
|
141
155
|
|
|
142
156
|
# Determine the source of the input: URL, file, or stdin
|
|
143
157
|
if args.html:
|
|
@@ -150,29 +164,29 @@ def main():
|
|
|
150
164
|
response.raise_for_status()
|
|
151
165
|
inp = response.content
|
|
152
166
|
except requests.RequestException as e:
|
|
153
|
-
print(f"Error downloading HTML: {e}")
|
|
167
|
+
print(f"Error downloading HTML: {e}", file=sys.stderr)
|
|
154
168
|
sys.exit(1)
|
|
155
169
|
else:
|
|
156
170
|
# If the input is a local file, try to open it
|
|
157
171
|
try:
|
|
158
172
|
inp = open(args.html, 'rb').read()
|
|
159
173
|
except FileNotFoundError:
|
|
160
|
-
print(f"Error: The file '{args.html}' was not found.")
|
|
174
|
+
print(f"Error: The file '{args.html}' was not found.", file=sys.stderr)
|
|
161
175
|
sys.exit(1)
|
|
162
176
|
else:
|
|
163
177
|
# If the input is from stdin
|
|
164
178
|
try:
|
|
165
179
|
inp = sys.stdin.buffer.read()
|
|
166
180
|
if not inp:
|
|
167
|
-
print("Error: No input received from stdin")
|
|
181
|
+
print("Error: No input received from stdin", file=sys.stderr)
|
|
168
182
|
sys.exit(1)
|
|
169
183
|
except Exception as e:
|
|
170
|
-
print(f"Error reading input: {e}")
|
|
184
|
+
print(f"Error reading input: {e}", file=sys.stderr)
|
|
171
185
|
sys.exit(1)
|
|
172
186
|
|
|
173
187
|
# Check for empty or invalid input
|
|
174
188
|
if not inp:
|
|
175
|
-
print("Error: Input is empty or invalid")
|
|
189
|
+
print("Error: Input is empty or invalid", file=sys.stderr)
|
|
176
190
|
sys.exit(1)
|
|
177
191
|
|
|
178
192
|
# Convert CSS selectors to XPath if necessary
|
|
@@ -223,7 +237,7 @@ def main():
|
|
|
223
237
|
document = etree.fromstring(inp, html_parser)
|
|
224
238
|
except (etree.XMLSyntaxError, UnicodeDecodeError) as e:
|
|
225
239
|
# Print an error in case of syntax issues in the HTML
|
|
226
|
-
print(f"Error parsing HTML: {e}")
|
|
240
|
+
print(f"Error parsing HTML: {e}", file=sys.stderr)
|
|
227
241
|
sys.exit(1)
|
|
228
242
|
|
|
229
243
|
results = []
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: scrape_cli
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.2.3
|
|
4
4
|
Summary: It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.
|
|
5
5
|
Author-email: Andrea Borruso <aborruso@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/aborruso/scrape-cli
|
|
@@ -158,21 +158,21 @@ def test_empty_stdin_returns_error():
|
|
|
158
158
|
result = run_scrape("-e", "//p", input_data="")
|
|
159
159
|
|
|
160
160
|
assert result.returncode == 1
|
|
161
|
-
assert "Error: No input received from stdin" in result.stdout
|
|
161
|
+
assert "Error: No input received from stdin" in result.stderr
|
|
162
162
|
|
|
163
163
|
|
|
164
164
|
def test_missing_file_returns_error():
|
|
165
165
|
result = run_scrape("resources/this-file-does-not-exist.html", "-e", "//p")
|
|
166
166
|
|
|
167
167
|
assert result.returncode == 1
|
|
168
|
-
assert "was not found" in result.stdout
|
|
168
|
+
assert "was not found" in result.stderr
|
|
169
169
|
|
|
170
170
|
|
|
171
171
|
def test_missing_expression_without_text_returns_error():
|
|
172
172
|
result = run_scrape(str(TEST_HTML))
|
|
173
173
|
|
|
174
174
|
assert result.returncode == 1
|
|
175
|
-
assert "
|
|
175
|
+
assert "no expression specified" in result.stderr
|
|
176
176
|
|
|
177
177
|
|
|
178
178
|
def test_incorrect_eb_order_exits_with_specific_message():
|
|
@@ -186,7 +186,7 @@ def test_invalid_css_selector_fails_conversion():
|
|
|
186
186
|
result = run_scrape(str(TEST_HTML), "-e", "div[")
|
|
187
187
|
|
|
188
188
|
assert result.returncode == 1
|
|
189
|
-
assert "Error converting CSS selector to XPath" in result.stdout
|
|
189
|
+
assert "Error converting CSS selector to XPath" in result.stderr
|
|
190
190
|
|
|
191
191
|
|
|
192
192
|
def test_url_input_downloads_and_extracts_text():
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|