ezextract-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ezextract/__init__.py ADDED
@@ -0,0 +1,601 @@
import time
import os
import httpx
import threading
import logging
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout

from .utils import clean_text, save_csv, save_json


class Scraper:
    def __init__(
        self,
        url=None,
        delay=0.0,
        headers=None,
        max_requests_per_minute=None,
        debug=False,
        strict=False,
    ):
        self._playwright = None
        self._browser = None
        self._context = None

        self.base_url = url
        self.delay = delay
        self.debug = debug
        self.strict = strict

        self._state = threading.local()
        self._lock = threading.Lock()

        # Setup logging
        self.logger = logging.getLogger(__name__)
        if debug:
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.WARNING)

        # headers
        head = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

        if headers:
            # httpx matches header names case-insensitively, so a plain update
            # is enough for caller-supplied headers to override the defaults
            head.update(headers)

        self.client = httpx.Client(headers=head, follow_redirects=True, timeout=15.0)

        self.last_req = 0

        if max_requests_per_minute and max_requests_per_minute > 0:
            self.delay = max(self.delay, 60.0 / max_requests_per_minute)

    @property
    def soup(self):
        """Get the current BeautifulSoup object from thread-local state."""
        return getattr(self._state, "soup", None)

    @property
    def response(self):
        """Get the current response object from thread-local state."""
        return getattr(self._state, "res", None)

    def _init_browser(self):
        """Initialize browser if not already initialized."""
        if self._browser:
            return

        self._playwright = sync_playwright().start()
        self._browser = self._playwright.chromium.launch(headless=True)
        self._context = self._browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36",
            locale="en-US",
        )

    def _close_browser(self):
        """Close browser and cleanup resources."""
        if self._context:
            self._context.close()
        if self._browser:
            self._browser.close()
        if self._playwright:
            self._playwright.stop()

        self._context = None
        self._browser = None
        self._playwright = None

    def _wait(self):
        """Enforce rate limiting between requests."""
        with self._lock:
            passed = time.time() - self.last_req
            if passed < self.delay:
                time.sleep(self.delay - passed)
            self.last_req = time.time()

    def fetch(self, url=None, retries=0, use_browser=False):
        """
        Fetch a URL and parse its content.

        Args:
            url: URL to fetch (uses base_url if not provided)
            retries: Number of retries on failure
            use_browser: Force using Playwright browser instead of httpx

        Returns:
            self if successful, None if failed
        """
        target = url or self.base_url
        if not target:
            self.logger.warning("No URL provided to fetch")
            return self

        if use_browser:
            return self._fetch_browser(target)

        for i in range(retries + 1):
            try:
                self._wait()
                self.logger.debug(f"Fetching: {target}")

                res = self.client.get(target)

                text_low = res.text.lower()
                # Check for common blocking indicators
                blocking_indicators = (
                    res.status_code in (403, 429),
                    "captcha" in text_low,
                    "cloudflare" in text_low,
                    "verify you are human" in text_low,
                    "access denied" in text_low,
                )

                if any(blocking_indicators):
                    self.logger.warning(
                        "Access blocked or soft-blocked -> switching to browser mode"
                    )
                    return self._fetch_browser(target)

                if self.strict:
                    res.raise_for_status()

                self._state.res = res
                self._state.soup = BeautifulSoup(res.text, "html.parser")
                return self

            except httpx.HTTPError as e:
                self.logger.error(f"HTTP error on {target}: {e}")
                if self.strict and i == retries:
                    raise
                if i < retries:
                    time.sleep(1)
            except Exception as e:
                self.logger.error(f"Error on {target}: {e}")
                if self.strict and i == retries:
                    raise
                if i < retries:
                    time.sleep(1)

        return None

    def _fetch_browser(self, target, wait=1.5):
        """
        Fetch URL using Playwright browser for JavaScript-heavy pages.

        Args:
            target: URL to fetch
            wait: Time to wait after page load (seconds)

        Returns:
            self if successful
        """
        self.logger.debug(f"[browser] fetching: {target}")

        self._init_browser()

        page = self._context.new_page()
        try:
            try:
                page.goto(target, wait_until="networkidle", timeout=15000)
            except PlaywrightTimeout:
                self.logger.warning(
                    f"Playwright timeout for {target}, continuing anyway"
                )

            time.sleep(wait)
            html = page.content()
        except Exception as e:
            self.logger.error(f"Browser error fetching {target}: {e}")
            return None
        finally:
            page.close()

        # Create mock response object
        mock_response = type(
            "Response", (), {"url": target, "text": html, "status_code": 200}
        )()
        self._state.res = mock_response
        self._state.soup = BeautifulSoup(html, "html.parser")
        return self

    def render_js(self, wait=2):
        """
        Re-fetch current page with JavaScript rendering enabled.

        Args:
            wait: Time to wait after page load (seconds)

        Returns:
            self if successful
        """
        if not self.response:
            self.logger.warning("No current response to render")
            return self
        return self._fetch_browser(str(self.response.url), wait=wait)

    def get_text(self, selector):
        """
        Extract text from first element matching selector.

        Args:
            selector: CSS selector

        Returns:
            Text content or empty string
        """
        if not self.soup:
            self.logger.warning("No soup object available")
            return ""
        el = self.soup.select_one(selector)
        return el.get_text(strip=True) if el else ""

    def get_text_clean(self, selector):
        """Extract and clean text from element."""
        return clean_text(self.get_text(selector))

    def get_links(self):
        """Extract all unique links from current page."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        links = set()
        for a in self.soup.find_all("a", href=True):
            try:
                href = a.get("href", "").strip()
                if href:  # Skip empty hrefs
                    full_url = urljoin(str(self.response.url), href)
                    links.add(full_url)
            except Exception as e:
                self.logger.debug(f"Error processing link: {e}")
                continue

        return list(links)

    def get_images(self):
        """Extract all unique image URLs from current page."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        images = set()
        for img in self.soup.find_all("img", src=True):
            try:
                src = img.get("src", "").strip()
                if src:  # Skip empty src attributes
                    full_url = urljoin(str(self.response.url), src)
                    images.add(full_url)
            except Exception as e:
                self.logger.debug(f"Error processing image: {e}")
                continue

        return list(images)

    def get_json(self, url=None):
        """
        Fetch and parse JSON from URL.

        Args:
            url: URL to fetch (uses base_url if not provided)

        Returns:
            Parsed JSON data
        """
        target = url or self.base_url
        if not target:
            raise ValueError("No URL provided for JSON fetch")

        self._wait()
        try:
            r = self.client.get(target)
            r.raise_for_status()
            return r.json()
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error fetching JSON from {target}: {e}")
            raise
        except ValueError as e:
            self.logger.error(f"Invalid JSON from {target}: {e}")
            raise
        finally:
            self.last_req = time.time()

    def get_table(self, selector=None):
        """
        Extract table data with support for colspan/rowspan.

        Args:
            selector: CSS selector for table(s), defaults to "table.wikitable"

        Returns:
            List of lists representing table rows
        """
        if not self.soup:
            self.logger.warning("No soup object available")
            return []

        tables = (
            self.soup.select(selector)
            if selector
            else self.soup.select("table.wikitable")
        )

        if not tables:
            self.logger.debug(f"No tables found with selector: {selector}")
            return []

        # Find table with most rows
        table = max(tables, key=lambda t: len(t.find_all("tr"))) if tables else None
        if not table:
            return []

        rows = table.find_all("tr")
        matrix = []
        active_spans = {}
        max_cols = 0

        for tr in rows:
            cells = tr.find_all(["td", "th"])
            row = []
            col = 0
            cell_i = 0

            while cell_i < len(cells) or col in active_spans:
                if col in active_spans:
                    val, remaining = active_spans[col]
                    row.append(val)
                    if remaining > 1:
                        active_spans[col][1] -= 1
                    else:
                        del active_spans[col]
                    col += 1
                    continue

                if cell_i >= len(cells):
                    break

                cell = cells[cell_i]
                cell_i += 1

                try:
                    rowspan = int(cell.get("rowspan", 1))
                    colspan = int(cell.get("colspan", 1))
                except (ValueError, TypeError):
                    rowspan, colspan = 1, 1

                value = cell.get_text(" ", strip=True)

                if rowspan > 1:
                    for c in range(colspan):
                        active_spans[col + c] = [value, rowspan - 1]

                for _ in range(colspan):
                    row.append(value)
                    col += 1

            max_cols = max(max_cols, len(row))
            matrix.append(row)

        # Pad rows with empty strings
        for r in matrix:
            if len(r) < max_cols:
                r.extend([""] * (max_cols - len(r)))

        return matrix

    def scrape_pages(self, url_pattern, pages, selector):
        """
        Scrape multiple pages with numbered URL pattern.

        Args:
            url_pattern: URL pattern with {}, e.g., "https://example.com/page/{}"
            pages: Number of pages to scrape
            selector: CSS selector for elements to extract

        Returns:
            List of extracted text values
        """
        if pages < 1:
            raise ValueError("pages must be >= 1")

        results = []
        for i in range(1, pages + 1):
            self.logger.debug(f"Scraping page {i}/{pages}")
            try:
                formatted_url = url_pattern.format(i)
                if self.fetch(formatted_url):
                    results.extend(
                        [el.get_text(strip=True) for el in self.soup.select(selector)]
                    )
            except Exception as e:
                self.logger.error(f"Error scraping page {i}: {e}")
                if self.strict:
                    raise
        return results

    def scrape_auto_next(self, url, selector, max_pages=10):
        """
        Scrape pages by following "Next" button.

        Args:
            url: Starting URL
            selector: CSS selector for elements to extract
            max_pages: Maximum pages to scrape

        Returns:
            List of extracted text values
        """
        if max_pages < 1:
            raise ValueError("max_pages must be >= 1")

        data, curr = [], url
        for page_num in range(max_pages):
            self.logger.debug(f"Scraping auto-next page {page_num + 1}/{max_pages}")

            if not self.fetch(curr):
                break

            data.extend([el.get_text(strip=True) for el in self.soup.select(selector)])

            # IMPROVED: More resilient "Next" button detection
            nxt = (
                self.soup.find("a", string=lambda t: t and "next" in t.lower().strip())
                or self.soup.find("a", attrs={"rel": "next"})
                or self.soup.select_one("li.next a")
                or self.soup.select_one("a.next")
            )

            if nxt and nxt.get("href"):
                curr = urljoin(str(self.response.url), nxt["href"])
            else:
                self.logger.debug("No 'Next' button found, stopping pagination")
                break

        return data

    def fetch_multiple(self, urls, workers=5):
        """
        Fetch multiple URLs in parallel.

        Args:
            urls: List of URLs to fetch
            workers: Number of worker threads

        Returns:
            List of tuples (url, success_status)
        """
        if not urls:
            self.logger.warning("No URLs provided for parallel fetch")
            return []

        if workers < 1:
            raise ValueError("workers must be >= 1")

        with ThreadPoolExecutor(max_workers=workers) as pool:
            return list(pool.map(lambda u: (u, self.fetch(u) is not None), urls))

    def submit_form(self, url, data):
        """
        Submit form data via POST.

        Args:
            url: Form endpoint URL
            data: Dictionary of form data

        Returns:
            self
        """
        if not url or not data:
            raise ValueError("url and data are required")

        self._wait()
        try:
            res = self.client.post(url, data=data)
            res.raise_for_status()
            self._state.res = res
            self._state.soup = BeautifulSoup(res.text, "html.parser")
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error submitting form to {url}: {e}")
            if self.strict:
                raise
        return self

    def download_file(self, url, dest):
        """
        Download file from URL.

        Args:
            url: File URL
            dest: Destination file path
        """
        if not url or not dest:
            raise ValueError("url and dest are required")

        try:
            os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
            with self.client.stream("GET", url) as r:
                r.raise_for_status()
                with open(dest, "wb") as f:
                    for chunk in r.iter_bytes():
                        f.write(chunk)
            self.logger.debug(f"Downloaded: {dest}")
        except httpx.HTTPError as e:
            self.logger.error(f"HTTP error downloading {url}: {e}")
            raise
        except IOError as e:
            self.logger.error(f"Error writing to {dest}: {e}")
            raise

    def download_images(self, folder="images/"):
        """
        Download all images from current page.

        Args:
            folder: Destination folder for images
        """
        if not folder:
            raise ValueError("folder is required")

        images = self.get_images()
        if not images:
            self.logger.warning("No images found to download")
            return

        for i, url in enumerate(images):
            try:
                # Guess a file extension from the URL, defaulting to jpg
                ext = url.split(".")[-1].split("?")[0]
                if not ext or len(ext) > 4:
                    ext = "jpg"
                dest = os.path.join(folder, f"img_{i}.{ext}")
                self.download_file(url, dest)
            except Exception as e:
                self.logger.error(f"Error downloading image {i} from {url}: {e}")
                if self.strict:
                    raise

    def list_selectors(self):
        """List available HTML tags, IDs, and classes for debugging."""
        if not self.soup:
            self.logger.warning("No soup object available")
            return

        tags = list(set(el.name for el in self.soup.find_all()))[:15]
        ids = list(set(el["id"] for el in self.soup.find_all(id=True)))[:15]
        classes = list(
            set(c for el in self.soup.find_all(class_=True) for c in el["class"])
        )[:15]

        print(f"tags: {tags}\nids: {ids}\nclasses: {classes}")

    def export_csv(self, data, path):
        """Export data to CSV file."""
        if not path:
            raise ValueError("path is required")
        save_csv(data, path)
        self.logger.debug(f"Exported CSV: {path}")

    def export_json(self, data, path):
        """Export data to JSON file."""
        if not path:
            raise ValueError("path is required")
        save_json(data, path)
        self.logger.debug(f"Exported JSON: {path}")

    def close(self):
        """Close client and cleanup resources."""
        self.client.close()
        self._close_browser()
        self.logger.debug("Scraper closed")
ezextract/utils.py ADDED
@@ -0,0 +1,27 @@
import os
import csv
import json


def clean_text(text):
    return " ".join(text.split())


def save_csv(data, path):
    # exports data to csv
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        if data and isinstance(data[0], dict):
            w.writerow(data[0].keys())
            for r in data:
                w.writerow(r.values())
        else:
            for r in data:
                # wrap scalar rows (e.g. plain strings) so each value becomes
                # one cell instead of being split into one column per character
                w.writerow(r if isinstance(r, (list, tuple)) else [r])


def save_json(data, path):
    # exports data to json
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
ezextract-0.1.1.dist-info/METADATA ADDED
@@ -0,0 +1,174 @@
Metadata-Version: 2.4
Name: ezextract
Version: 0.1.1
Summary: A simple, human-friendly web scraper wrapper using httpx, BeautifulSoup, and Playwright.
Project-URL: Homepage, https://github.com/E4crypt3d/ezextract
Project-URL: Bug Tracker, https://github.com/E4crypt3d/ezextract/issues
Author: E4crypt3d
License: MIT
License-File: LICENSE
Keywords: crawler,data-extraction,httpx,playwright,scraper,webscraping
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.8
Requires-Dist: beautifulsoup4>=4.12.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: playwright>=1.35.0
Description-Content-Type: text/markdown

# Scraper

A simple and practical Python web scraper for real‑world websites. It supports normal HTTP scraping, JavaScript rendering via Playwright, rate limiting, pagination, form submission, file downloads, and structured data extraction.

Designed to be usable, reliable, and flexible.

---

## Features

* HTTP scraping with `httpx`
* Automatic fallback to browser mode (Playwright) when blocked
* JavaScript rendering support
* Rate limiting
* Parallel fetching
* Table extraction with `rowspan` / `colspan`
* Pagination (patterned pages + "Next" button)
* JSON endpoints
* Form submission
* Image and file downloading
* CSV / JSON export
* Logging system

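
Most of these behaviours are switched on through the `Scraper` constructor; a minimal sketch using the constructor parameters the package defines (the URL and header values below are only examples):

```python
from ezextract import Scraper

# delay / max_requests_per_minute throttle requests, strict raises on HTTP
# errors, and debug enables verbose logging (example values only)
s = Scraper(
    url="https://example.com",            # hypothetical target
    max_requests_per_minute=30,           # at most one request every 2 seconds
    headers={"Accept-Language": "en-GB"}, # merged over the default headers
    strict=False,
    debug=True,
)
```
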
---

## Install

```bash
pip install ezextract
playwright install chromium
```

---

## Basic Usage

```python
from ezextract import Scraper

s = Scraper(url="https://example.com", debug=True)
s.fetch()

print(s.get_text("h1"))
print(s.get_links())
```

---

## Common Examples

### Fetch a page

```python
s.fetch("https://example.com")
```

### Force JS rendering

```python
s.fetch("https://site.com", use_browser=True)
# or
s.render_js()
```

### Extract text

```python
s.get_text(".title")
s.get_text_clean(".content")
```

### Get links & images

```python
s.get_links()
s.get_images()
```

### Extract tables

```python
table = s.get_table()
```
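
`get_table` also takes an optional CSS selector; when several tables match, it keeps the one with the most rows. A small sketch (the `table.stats` selector is made up for illustration):

```python
# pick a specific table instead of the default "table.wikitable"
rows = s.get_table("table.stats")  # list of lists, one inner list per row
```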

### Scrape paginated pages

```python
s.scrape_pages("https://site.com/page/{}", 5, ".item")
```

### Auto "Next" pagination

```python
s.scrape_auto_next("https://site.com", ".post")
```

### Parallel fetch

```python
urls = ["https://a.com", "https://b.com"]
s.fetch_multiple(urls, workers=5)
```
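
`fetch_multiple` returns `(url, success)` pairs rather than parsed pages, so failures can be collected afterwards; a minimal sketch:

```python
results = s.fetch_multiple(urls, workers=5)
failed = [u for u, ok in results if not ok]
print(f"{len(failed)} of {len(results)} fetches failed")
```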

### JSON API

```python
s.get_json("https://api.site.com/data")
```

### Submit form

```python
s.submit_form("https://site.com/login", {
    "user": "name",
    "pass": "password"
})
```
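
Because `submit_form` parses the response and returns the scraper itself, calls can be chained; a sketch in which the URL, field names, and `.welcome` selector are placeholders:

```python
welcome = s.submit_form("https://site.com/login", {
    "user": "name",
    "pass": "password",
}).get_text(".welcome")
```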

### Download files

```python
s.download_file(url, "file.pdf")
s.download_images("images/")
```

### Export

```python
s.export_csv(data, "data.csv")
s.export_json(data, "data.json")
```

---

## Close resources

```python
s.close()
```

---

## Notes

* Automatically switches to browser mode if blocked
* Thread‑safe request handling
* Suitable for large, multi-page scraping jobs (see the sketch below)
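
The pieces above can be combined into a small end-to-end job; a hedged sketch in which the site URL, `.post` selector, and limits are placeholders:

```python
from ezextract import Scraper

s = Scraper(max_requests_per_minute=20, strict=False, debug=True)
try:
    # follow "Next" links and collect the text of a hypothetical ".post" selector
    posts = s.scrape_auto_next("https://site.example/blog", ".post", max_pages=5)
    s.export_csv(posts, "out/posts.csv")
finally:
    s.close()  # closes the httpx client and any Playwright browser
```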
ezextract-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
ezextract/__init__.py,sha256=yDrFgRAeM17g3G8P7rYuDvZEMv7_EPsd_PnrDcFdpRg,19424
ezextract/utils.py,sha256=GEYnY2uyHu6H0RZgNdBqL9aTCf1CBnkWy3iU-1fLs4k,719
ezextract-0.1.1.dist-info/METADATA,sha256=zRd0O3Ji-Oy7pPWB6jQdOzFyI5YJp2y1kue9m99r65k,3325
ezextract-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
ezextract-0.1.1.dist-info/licenses/LICENSE,sha256=FlCDijFBNIEvSOuLiIeEHZSDe27VNZUvw8_WlC6sUYE,1118
ezextract-0.1.1.dist-info/RECORD,,
ezextract-0.1.1.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
Wheel-Version: 1.0
Generator: hatchling 1.28.0
Root-Is-Purelib: true
Tag: py3-none-any
ezextract-0.1.1.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,11 @@
MIT License

Copyright (c) 2026 E4crypt3d (https://github.com/E4crypt3d)



Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.