ezextract-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ezextract/__init__.py +601 -0
- ezextract/utils.py +27 -0
- ezextract-0.1.1.dist-info/METADATA +174 -0
- ezextract-0.1.1.dist-info/RECORD +6 -0
- ezextract-0.1.1.dist-info/WHEEL +4 -0
- ezextract-0.1.1.dist-info/licenses/LICENSE +11 -0
ezextract/__init__.py
ADDED
@@ -0,0 +1,601 @@
import time
import os
import httpx
import threading
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

from .utils import clean_text, save_csv, save_json


class Scraper:
    def __init__(
        self,
        url=None,
        delay=0.0,
        headers=None,
        max_requests_per_minute=None,
        debug=False,
        strict=False,
    ):
        self._playwright = None
        self._browser = None
        self._context = None

        self.base_url = url
        self.delay = delay
        self.debug = debug
        self.strict = strict

        self._state = threading.local()
        self._lock = threading.Lock()

        # Setup logging
        self.logger = logging.getLogger(__name__)
        if debug:
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.WARNING)

        # headers
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

        if headers:
            head.update({k.lower(): v for k, v in headers.items()})

        self.client = httpx.Client(headers=head, follow_redirects=True, timeout=15.0)

        self.last_req = 0

        if max_requests_per_minute and max_requests_per_minute > 0:
            self.delay = max(self.delay, 60.0 / max_requests_per_minute)

    @property
    def soup(self):
        """Get the current BeautifulSoup object from thread-local state."""
        return getattr(self._state, "soup", None)

    @property
    def response(self):
        """Get the current response object from thread-local state."""
        return getattr(self._state, "res", None)

    def _init_browser(self):
        """Initialize browser if not already initialized."""
        if self._browser:
            return

        self._playwright = sync_playwright().start()
        self._browser = self._playwright.chromium.launch(headless=True)
        self._context = self._browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36",
            locale="en-US",
        )

    def _close_browser(self):
        """Close browser and cleanup resources."""
        if self._context:
            self._context.close()
        if self._browser:
            self._browser.close()
        if self._playwright:
            self._playwright.stop()

        self._context = None
        self._browser = None
        self._playwright = None

    def _wait(self):
        """Enforce rate limiting between requests."""
        with self._lock:
            passed = time.time() - self.last_req
            if passed < self.delay:
                time.sleep(self.delay - passed)
            self.last_req = time.time()

    def fetch(self, url=None, retries=0, use_browser=False):
        """
        Fetch a URL and parse its content.

        Args:
            url: URL to fetch (uses base_url if not provided)
            retries: Number of retries on failure
            use_browser: Force using Playwright browser instead of httpx

        Returns:
            self if successful, None if failed
        """
        target = url or self.base_url
        if not target:
            self.logger.warning("No URL provided to fetch")
            return self

        if use_browser:
            return self._fetch_browser(target)

        for i in range(retries + 1):
            try:
                self._wait()
                self.logger.debug(f"Fetching: {target}")

                res = self.client.get(target)

                text_low = res.text.lower()
                # Check for common blocking indicators
                blocking_indicators = (
                    res.status_code in (403, 429),
                    "captcha" in text_low,
                    "cloudflare" in text_low,
                    "verify you are human" in text_low,
                    "access denied" in text_low,
                )

                if any(blocking_indicators):
                    self.logger.warning(
                        "Access blocked or soft-blocked -> switching to browser mode"
                    )
                    return self._fetch_browser(target)

                if self.strict:
                    res.raise_for_status()

                self._state.res = res
                self._state.soup = BeautifulSoup(res.text, "html.parser")
                return self

            except httpx.HTTPError as e:
                self.logger.error(f"HTTP error on {target}: {e}")
                if self.strict and i == retries:
                    raise
                if i < retries:
                    time.sleep(1)
            except Exception as e:
                self.logger.error(f"Error on {target}: {e}")
                if self.strict and i == retries:
                    raise
                if i < retries:
                    time.sleep(1)

        return None

    def _fetch_browser(self, target, wait=1.5):
        """
        Fetch URL using Playwright browser for JavaScript-heavy pages.

        Args:
            target: URL to fetch
            wait: Time to wait after page load (seconds)

        Returns:
            self if successful
        """
        self.logger.debug(f"[browser] fetching: {target}")

        self._init_browser()

        page = self._context.new_page()
        try:
            try:
                page.goto(target, wait_until="networkidle", timeout=15000)
            except PlaywrightTimeout:
                self.logger.warning(
                    f"Playwright timeout for {target}, continuing anyway"
                )

            time.sleep(wait)
            html = page.content()
        except Exception as e:
            self.logger.error(f"Browser error fetching {target}: {e}")
            return None
        finally:
            page.close()

        # Create mock response object
        mock_response = type(
            "Response", (), {"url": target, "text": html, "status_code": 200}
        )()
        self._state.res = mock_response
        self._state.soup = BeautifulSoup(html, "html.parser")
        return self

    def render_js(self, wait=2):
        """
        Re-fetch current page with JavaScript rendering enabled.

        Args:
            wait: Time to wait after page load (seconds)

        Returns:
            self if successful
        """
        if not self.response:
            self.logger.warning("No current response to render")
            return self
        return self._fetch_browser(str(self.response.url), wait=wait)

    def get_text(self, selector):
        """
        Extract text from first element matching selector.

        Args:
            selector: CSS selector

        Returns:
            Text content or empty string
        """
        if not self.soup:
            self.logger.warning("No soup object available")
            return ""
        el = self.soup.select_one(selector)
        return el.get_text(strip=True) if el else ""

    def get_text_clean(self, selector):
        """Extract and clean text from element."""
        return clean_text(self.get_text(selector))

    def get_links(self):
        """Extract all unique links from current page."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        links = set()
        for a in self.soup.find_all("a", href=True):
            try:
                href = a.get("href", "").strip()
                if href:  # Skip empty hrefs
                    full_url = urljoin(str(self.response.url), href)
                    links.add(full_url)
            except Exception as e:
                self.logger.debug(f"Error processing link: {e}")
                continue

        return list(links)

    def get_images(self):
        """Extract all unique image URLs from current page."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        images = set()
        for img in self.soup.find_all("img", src=True):
            try:
                src = img.get("src", "").strip()
                if src:  # Skip empty src attributes
                    full_url = urljoin(str(self.response.url), src)
                    images.add(full_url)
            except Exception as e:
                self.logger.debug(f"Error processing image: {e}")
                continue

        return list(images)

    def get_json(self, url=None):
        """
        Fetch and parse JSON from URL.

        Args:
            url: URL to fetch (uses base_url if not provided)

        Returns:
            Parsed JSON data
        """
        target = url or self.base_url
        if not target:
            raise ValueError("No URL provided for JSON fetch")

        self._wait()
        try:
            r = self.client.get(target)
            r.raise_for_status()
            return r.json()
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error fetching JSON from {target}: {e}")
            raise
        except ValueError as e:
            self.logger.error(f"Invalid JSON from {target}: {e}")
            raise
        finally:
            self.last_req = time.time()

    def get_table(self, selector=None):
        """
        Extract table data with support for colspan/rowspan.

        Args:
            selector: CSS selector for table(s), defaults to ".wikitable"

        Returns:
            List of lists representing table rows
        """
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        tables = (
            self.soup.select(selector)
            if selector
            else self.soup.select("table.wikitable")
        )

        if not tables:
            self.logger.debug(f"No tables found with selector: {selector}")
            return []

        # Find table with most rows
        table = max(tables, key=lambda t: len(t.find_all("tr"))) if tables else None
        if not table:
            return []

        rows = table.find_all("tr")
        matrix = []
        active_spans = {}
        max_cols = 0

        for tr in rows:
            cells = tr.find_all(["td", "th"])
            row = []
            col = 0
            cell_i = 0

            while cell_i < len(cells) or col in active_spans:
                if col in active_spans:
                    val, remaining = active_spans[col]
                    row.append(val)
                    if remaining > 1:
                        active_spans[col][1] -= 1
                    else:
                        del active_spans[col]
                    col += 1
                    continue

                if cell_i >= len(cells):
                    break

                cell = cells[cell_i]
                cell_i += 1

                try:
                    rowspan = int(cell.get("rowspan", 1))
                    colspan = int(cell.get("colspan", 1))
                except (ValueError, TypeError):
                    rowspan, colspan = 1, 1

                value = cell.get_text(" ", strip=True)

                if rowspan > 1:
                    for c in range(colspan):
                        active_spans[col + c] = [value, rowspan - 1]

                for _ in range(colspan):
                    row.append(value)
                    col += 1

            max_cols = max(max_cols, len(row))
            matrix.append(row)

        # Pad rows with empty strings
        for r in matrix:
            if len(r) < max_cols:
                r.extend([""] * (max_cols - len(r)))

        return matrix

    def scrape_pages(self, url_pattern, pages, selector):
        """
        Scrape multiple pages with numbered URL pattern.

        Args:
            url_pattern: URL pattern with {}, e.g., "https://example.com/page/{}"
            pages: Number of pages to scrape
            selector: CSS selector for elements to extract

        Returns:
            List of extracted text values
        """
        if pages < 1:
            raise ValueError("pages must be >= 1")

        results = []
        for i in range(1, pages + 1):
            self.logger.debug(f"Scraping page {i}/{pages}")
            try:
                formatted_url = url_pattern.format(i)
                if self.fetch(formatted_url):
                    results.extend(
                        [el.get_text(strip=True) for el in self.soup.select(selector)]
                    )
            except Exception as e:
                self.logger.error(f"Error scraping page {i}: {e}")
                if self.strict:
                    raise
        return results

    def scrape_auto_next(self, url, selector, max_pages=10):
        """
        Scrape pages by following "Next" button.

        Args:
            url: Starting URL
            selector: CSS selector for elements to extract
            max_pages: Maximum pages to scrape

        Returns:
            List of extracted text values
        """
        if max_pages < 1:
            raise ValueError("max_pages must be >= 1")

        data, curr = [], url
        for page_num in range(max_pages):
            self.logger.debug(f"Scraping auto-next page {page_num + 1}/{max_pages}")

            if not self.fetch(curr):
                break

            data.extend([el.get_text(strip=True) for el in self.soup.select(selector)])

            # IMPROVED: More resilient "Next" button detection
            nxt = (
                self.soup.find("a", string=lambda t: t and "next" in t.lower().strip())
                or self.soup.find("a", attrs={"rel": "next"})
                or self.soup.select_one("li.next a")
                or self.soup.select_one("a.next")
            )

            if nxt and nxt.get("href"):
                curr = urljoin(str(self.response.url), nxt["href"])
            else:
                self.logger.debug("No 'Next' button found, stopping pagination")
                break

        return data

    def fetch_multiple(self, urls, workers=5):
        """
        Fetch multiple URLs in parallel.

        Args:
            urls: List of URLs to fetch
            workers: Number of worker threads

        Returns:
            List of tuples (url, success_status)
        """
        if not urls:
            self.logger.warning("No URLs provided for parallel fetch")
            return []

        if workers < 1:
            raise ValueError("workers must be >= 1")

        with ThreadPoolExecutor(max_workers=workers) as pool:
            return list(pool.map(lambda u: (u, self.fetch(u) is not None), urls))

    def submit_form(self, url, data):
        """
        Submit form data via POST.

        Args:
            url: Form endpoint URL
            data: Dictionary of form data

        Returns:
            self
        """
        if not url or not data:
            raise ValueError("url and data are required")

        self._wait()
        try:
            res = self.client.post(url, data=data)
            res.raise_for_status()
            self._state.res = res
            self._state.soup = BeautifulSoup(res.text, "html.parser")
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error submitting form to {url}: {e}")
            if self.strict:
                raise
        return self

    def download_file(self, url, dest):
        """
        Download file from URL.

        Args:
            url: File URL
            dest: Destination file path
        """
        if not url or not dest:
            raise ValueError("url and dest are required")

        try:
            os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
            with self.client.stream("GET", url) as r:
                r.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in r.iter_bytes():
                        f.write(chunk)
            self.logger.debug(f"Downloaded: {dest}")
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error downloading {url}: {e}")
            raise
        except IOError as e:
            self.logger.error(f"Error writing to {dest}: {e}")
            raise

    def download_images(self, folder="images/"):
        """
        Download all images from current page.

        Args:
            folder: Destination folder for images
        """
        if not folder:
            raise ValueError("folder is required")

        images = self.get_images()
        if not images:
            self.logger.warning("No images found to download")
            return

        for i, url in enumerate(images):
            try:
                ext = url.split(".")[-1].split("?")[0][:3]
                if len(ext) > 3 or not ext:
                    ext = "jpg"
                dest = os.path.join(folder, f"img_{i}.{ext}")
                self.download_file(url, dest)
            except Exception as e:
                self.logger.error(f"Error downloading image {i} from {url}: {e}")
                if self.strict:
                    raise

    def list_selectors(self):
        """List available HTML tags, IDs, and classes for debugging."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return

        tags = list(set(el.name for el in self.soup.find_all()))[:15]
        ids = list(set(el["id"] for el in self.soup.find_all(id=True)))[:15]
        classes = list(
            set(c for el in self.soup.find_all(class_=True) for c in el["class"])
        )[:15]

        print(f"tags: {tags}\nids: {ids}\nclasses: {classes}")

    def export_csv(self, data, path):
        """Export data to CSV file."""
        if not path:
            raise ValueError("path is required")
        save_csv(data, path)
        self.logger.debug(f"Exported CSV: {path}")

    def export_json(self, data, path):
        """Export data to JSON file."""
        if not path:
            raise ValueError("path is required")
        save_json(data, path)
        self.logger.debug(f"Exported JSON: {path}")

    def close(self):
        """Close client and cleanup resources."""
        self.client.close()
        self._close_browser()
        self.logger.debug("Scraper closed")
ezextract/utils.py
ADDED
@@ -0,0 +1,27 @@
import os
import csv
import json


def clean_text(text):
    return " ".join(text.split())


def save_csv(data, path):
    # exports data to csv
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        if data and isinstance(data[0], dict):
            w.writerow(data[0].keys())
            for r in data:
                w.writerow(r.values())
        else:
            w.writerows(data)


def save_json(data, path):
    # exports data to json
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

ezextract-0.1.1.dist-info/METADATA
ADDED
@@ -0,0 +1,174 @@
Metadata-Version: 2.4
Name: ezextract
Version: 0.1.1
Summary: A simple, human-friendly web scraper wrapper using httpx, BeautifulSoup, and Playwright.
Project-URL: Homepage, https://github.com/E4crypt3d/ezextract
Project-URL: Bug Tracker, https://github.com/E4crypt3d/ezextract/issues
Author: E4crypt3d
License: MIT
License-File: LICENSE
Keywords: crawler,data-extraction,httpx,playwright,scraper,webscraping
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Requires-Dist: beautifulsoup4>=4.12.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: playwright>=1.35.0
Description-Content-Type: text/markdown

# Scraper

A simple and practical Python web scraper for real‑world websites. It supports normal HTTP scraping, JavaScript rendering via Playwright, rate limiting, pagination, form submission, file downloads, and structured data extraction.

Designed to be usable, reliable, and flexible.

---

## Features

* HTTP scraping with `httpx`
* Automatic fallback to browser mode (Playwright) when blocked
* JavaScript rendering support
* Rate limiting
* Parallel fetching
* Table extraction with `rowspan` / `colspan`
* Pagination (patterned pages + "Next" button)
* JSON endpoints
* Form submission
* Image and file downloading
* CSV / JSON export
* Logging system
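
The rate limiting and logging features above map onto constructor options defined in `ezextract/__init__.py` earlier in this diff. A minimal sketch, assuming `Scraper` is imported as in the Basic Usage section below; the URL and header values are placeholders:

```python
s = Scraper(
    url="https://example.com",             # placeholder base URL
    delay=1.0,                             # minimum seconds between requests
    max_requests_per_minute=30,            # raises the delay to at least 60/30 = 2.0s
    headers={"Accept-Language": "en-GB"},  # merged over the default browser-like headers
    debug=True,                            # enable DEBUG-level logging
    strict=False,                          # log HTTP errors instead of raising
)
```

With `strict=True`, `fetch()` calls `raise_for_status()` and re-raises HTTP errors on the final retry instead of returning `None`.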

---

## Install

```bash
pip install httpx beautifulsoup4 playwright
playwright install chromium
```

---

## Basic Usage

```python
from scraper import Scraper

s = Scraper(url="https://example.com", debug=True)
s.fetch()

print(s.get_text("h1"))
print(s.get_links())
```

---

## Common Examples

### Fetch a page

```python
s.fetch("https://example.com")
```

### Force JS rendering

```python
s.fetch("https://site.com", use_browser=True)
# or
s.render_js()
```

### Extract text

```python
s.get_text(".title")
s.get_text_clean(".content")
```

### Get links & images

```python
s.get_links()
s.get_images()
```

### Extract tables

```python
table = s.get_table()
```

### Scrape paginated pages

```python
s.scrape_pages("https://site.com/page/{}", 5, ".item")
```

### Auto "Next" pagination

```python
s.scrape_auto_next("https://site.com", ".post")
```

### Parallel fetch

```python
urls = ["https://a.com", "https://b.com"]
s.fetch_multiple(urls, workers=5)
```

### JSON API

```python
s.get_json("https://api.site.com/data")
```

### Submit form

```python
s.submit_form("https://site.com/login", {
    "user": "name",
    "pass": "password"
})
```

### Download files

```python
s.download_file(url, "file.pdf")
s.download_images("images/")
```

### Export

```python
s.export_csv(data, "data.csv")
s.export_json(data, "data.json")
```

---

## Close resources

```python
s.close()
```

---

## Notes

* Automatically switches to browser mode if blocked
* Thread‑safe request handling
* Suitable for large scraping jobs
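
A rough end-to-end sketch combining the calls documented above (the URL, table selector, and output path are placeholders):

```python
s = Scraper(debug=True, max_requests_per_minute=60)
try:
    # fetch() returns the scraper on success and None on failure
    if s.fetch("https://example.com/stats"):
        rows = s.get_table("table.results")   # list of lists, rowspan/colspan resolved
        if rows:
            s.export_csv(rows, "out/results.csv")
finally:
    s.close()  # closes the httpx client and any Playwright browser
```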

ezextract-0.1.1.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
ezextract/__init__.py,sha256=yDrFgRAeM17g3G8P7rYuDvZEMv7_EPsd_PnrDcFdpRg,19424
ezextract/utils.py,sha256=GEYnY2uyHu6H0RZgNdBqL9aTCf1CBnkWy3iU-1fLs4k,719
ezextract-0.1.1.dist-info/METADATA,sha256=zRd0O3Ji-Oy7pPWB6jQdOzFyI5YJp2y1kue9m99r65k,3325
ezextract-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
ezextract-0.1.1.dist-info/licenses/LICENSE,sha256=FlCDijFBNIEvSOuLiIeEHZSDe27VNZUvw8_WlC6sUYE,1118
ezextract-0.1.1.dist-info/RECORD,,

ezextract-0.1.1.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,11 @@
MIT License

Copyright (c) 2026 E4crypt3d (https://github.com/E4crypt3d)



Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.