scrape-cli 1.2.2__tar.gz → 1.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scrape_cli
3
- Version: 1.2.2
3
+ Version: 1.2.3
4
4
  Summary: It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.
5
5
  Author-email: Andrea Borruso <aborruso@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/aborruso/scrape-cli
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "scrape_cli"
7
- version = "1.2.2"
7
+ version = "1.2.3"
8
8
  description = "It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector."
9
9
  readme = "README.md"
10
10
  authors = [
@@ -4,7 +4,7 @@ scrape-cli - A command-line tool to extract HTML elements using XPath or CSS3 se
4
4
 
5
5
  from scrape_cli.scrape import main
6
6
 
7
- __version__ = "1.2.2"
7
+ __version__ = "1.2.3"
8
8
  __author__ = "Andrea Borruso"
9
9
  __author_email__ = "aborruso@gmail.com"
10
10
 
@@ -49,7 +49,7 @@ def convert_css_to_xpath(expression):
49
49
  try:
50
50
  return GenericTranslator().css_to_xpath(expression)
51
51
  except Exception as e:
52
- print(f"Error converting CSS selector to XPath: {e}")
52
+ print(f"Error converting CSS selector to XPath: {e}", file=sys.stderr)
53
53
  sys.exit(1)
54
54
 
55
55
  def is_xpath(expression):
@@ -98,7 +98,18 @@ def main():
98
98
  # Command line argument parser definition
99
99
  parser = argparse.ArgumentParser(
100
100
  description='Extract HTML elements using an XPath query or CSS3 selector.',
101
- epilog='Example: cat page.html | python scrape.py -e "//a/@href"'
101
+ formatter_class=argparse.RawDescriptionHelpFormatter,
102
+ epilog='''\
103
+ examples:
104
+ scrape -e "//h1" file.html XPath expression
105
+ scrape -e "h1.title" file.html CSS selector
106
+ scrape -e "//a" -a href file.html extract attribute
107
+ scrape -t file.html extract all text
108
+ scrape -e "//h1" https://example.com fetch from URL
109
+ cat file.html | scrape -e "//h1" read from stdin
110
+ scrape -e "//h1" -x file.html check existence (exit 0/1)
111
+ scrape -be "//article" file.html wrap output in <html><body>
112
+ '''
102
113
  )
103
114
 
104
115
  parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
@@ -134,10 +145,13 @@ def main():
134
145
 
135
146
  # Check that at least one expression is provided by the user (unless using -t option)
136
147
  if not args.expression and not args.text:
137
- parser.print_help()
138
- sys.exit(
139
- "Error: you must provide at least one XPath query or CSS3 selector using the -e option, or use -t to extract text."
148
+ print(
149
+ "Error: no expression specified. Use -e \"//selector\" or -t.\n"
150
+ " scrape -e \"//h1\" file.html\n"
151
+ " scrape -t file.html",
152
+ file=sys.stderr
140
153
  )
154
+ sys.exit(1)
141
155
 
142
156
  # Determine the source of the input: URL, file, or stdin
143
157
  if args.html:
@@ -150,29 +164,29 @@ def main():
150
164
  response.raise_for_status()
151
165
  inp = response.content
152
166
  except requests.RequestException as e:
153
- print(f"Error downloading HTML: {e}")
167
+ print(f"Error downloading HTML: {e}", file=sys.stderr)
154
168
  sys.exit(1)
155
169
  else:
156
170
  # If the input is a local file, try to open it
157
171
  try:
158
172
  inp = open(args.html, 'rb').read()
159
173
  except FileNotFoundError:
160
- print(f"Error: The file '{args.html}' was not found.")
174
+ print(f"Error: The file '{args.html}' was not found.", file=sys.stderr)
161
175
  sys.exit(1)
162
176
  else:
163
177
  # If the input is from stdin
164
178
  try:
165
179
  inp = sys.stdin.buffer.read()
166
180
  if not inp:
167
- print("Error: No input received from stdin")
181
+ print("Error: No input received from stdin", file=sys.stderr)
168
182
  sys.exit(1)
169
183
  except Exception as e:
170
- print(f"Error reading input: {e}")
184
+ print(f"Error reading input: {e}", file=sys.stderr)
171
185
  sys.exit(1)
172
186
 
173
187
  # Check for empty or invalid input
174
188
  if not inp:
175
- print("Error: Input is empty or invalid")
189
+ print("Error: Input is empty or invalid", file=sys.stderr)
176
190
  sys.exit(1)
177
191
 
178
192
  # Convert CSS selectors to XPath if necessary
@@ -223,7 +237,7 @@ def main():
223
237
  document = etree.fromstring(inp, html_parser)
224
238
  except (etree.XMLSyntaxError, UnicodeDecodeError) as e:
225
239
  # Print an error in case of syntax issues in the HTML
226
- print(f"Error parsing HTML: {e}")
240
+ print(f"Error parsing HTML: {e}", file=sys.stderr)
227
241
  sys.exit(1)
228
242
 
229
243
  results = []
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scrape_cli
3
- Version: 1.2.2
3
+ Version: 1.2.3
4
4
  Summary: It's a command-line tool to extract HTML elements using an XPath query or CSS3 selector.
5
5
  Author-email: Andrea Borruso <aborruso@gmail.com>
6
6
  Project-URL: Homepage, https://github.com/aborruso/scrape-cli
@@ -158,21 +158,21 @@ def test_empty_stdin_returns_error():
158
158
  result = run_scrape("-e", "//p", input_data="")
159
159
 
160
160
  assert result.returncode == 1
161
- assert "Error: No input received from stdin" in result.stdout
161
+ assert "Error: No input received from stdin" in result.stderr
162
162
 
163
163
 
164
164
  def test_missing_file_returns_error():
165
165
  result = run_scrape("resources/this-file-does-not-exist.html", "-e", "//p")
166
166
 
167
167
  assert result.returncode == 1
168
- assert "was not found" in result.stdout
168
+ assert "was not found" in result.stderr
169
169
 
170
170
 
171
171
  def test_missing_expression_without_text_returns_error():
172
172
  result = run_scrape(str(TEST_HTML))
173
173
 
174
174
  assert result.returncode == 1
175
- assert "you must provide at least one XPath query or CSS3 selector" in result.stderr
175
+ assert "no expression specified" in result.stderr
176
176
 
177
177
 
178
178
  def test_incorrect_eb_order_exits_with_specific_message():
@@ -186,7 +186,7 @@ def test_invalid_css_selector_fails_conversion():
186
186
  result = run_scrape(str(TEST_HTML), "-e", "div[")
187
187
 
188
188
  assert result.returncode == 1
189
- assert "Error converting CSS selector to XPath" in result.stdout
189
+ assert "Error converting CSS selector to XPath" in result.stderr
190
190
 
191
191
 
192
192
  def test_url_input_downloads_and_extracts_text():
File without changes
File without changes