ezextract 0.1.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,177 @@
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # End of https://www.toptal.com/developers/gitignore/api/python
177
+
@@ -0,0 +1,11 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 E4crypt3d (https://github.com/E4crypt3d)
4
+
5
+
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8
+
9
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10
+
11
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,174 @@
1
+ Metadata-Version: 2.4
2
+ Name: ezextract
3
+ Version: 0.1.1
4
+ Summary: A simple, human-friendly web scraper wrapper using httpx, BeautifulSoup, and Playwright.
5
+ Project-URL: Homepage, https://github.com/E4crypt3d/ezextract
6
+ Project-URL: Bug Tracker, https://github.com/E4crypt3d/ezextract/issues
7
+ Author: E4crypt3d
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: crawler,data-extraction,httpx,playwright,scraper,webscraping
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Topic :: Internet :: WWW/HTTP
21
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
+ Requires-Python: >=3.8
23
+ Requires-Dist: beautifulsoup4>=4.12.0
24
+ Requires-Dist: httpx>=0.24.0
25
+ Requires-Dist: playwright>=1.35.0
26
+ Description-Content-Type: text/markdown
27
+
28
+ # Scraper
29
+
30
+ A simple and practical Python web scraper for real‑world websites. It supports normal HTTP scraping, JavaScript rendering via Playwright, rate limiting, pagination, form submission, file downloads, and structured data extraction.
31
+
32
+ Designed to be usable, reliable, and flexible.
33
+
34
+ ---
35
+
36
+ ## Features
37
+
38
+ * HTTP scraping with `httpx`
39
+ * Automatic fallback to browser mode (Playwright) when blocked
40
+ * JavaScript rendering support
41
+ * Rate limiting
42
+ * Parallel fetching
43
+ * Table extraction with `rowspan` / `colspan`
44
+ * Pagination (patterned pages + "Next" button)
45
+ * JSON endpoints
46
+ * Form submission
47
+ * Image and file downloading
48
+ * CSV / JSON export
49
+ * Logging system
50
+
51
+ ---
52
+
53
+ ## Install
54
+
55
+ ```bash
56
+ pip install httpx beautifulsoup4 playwright
57
+ playwright install chromium
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Basic Usage
63
+
64
+ ```python
65
+ from scraper import Scraper
66
+
67
+ s = Scraper(url="https://example.com", debug=True)
68
+ s.fetch()
69
+
70
+ print(s.get_text("h1"))
71
+ print(s.get_links())
72
+ ```
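+
+ The constructor also exposes rate-limiting and error-handling options (`delay`, `max_requests_per_minute`, `strict`, and custom `headers`). A minimal sketch, assuming a placeholder URL and using only parameters defined in the `Scraper` constructor shipped in this package:
+
+ ```python
+ from scraper import Scraper
+
+ # Cap requests at roughly 30 per minute and raise on HTTP errors
+ s = Scraper(url="https://example.com", max_requests_per_minute=30, strict=True)
+ s.fetch()
+ ```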
73
+
74
+ ---
75
+
76
+ ## Common Examples
77
+
78
+ ### Fetch a page
79
+
80
+ ```python
81
+ s.fetch("https://example.com")
82
+ ```
83
+
84
+ ### Force JS rendering
85
+
86
+ ```python
87
+ s.fetch("https://site.com", use_browser=True)
88
+ # or
89
+ s.render_js()
90
+ ```
91
+
92
+ ### Extract text
93
+
94
+ ```python
95
+ s.get_text(".title")
96
+ s.get_text_clean(".content")
97
+ ```
98
+
99
+ ### Get links & images
100
+
101
+ ```python
102
+ s.get_links()
103
+ s.get_images()
104
+ ```
105
+
106
+ ### Extract tables
107
+
108
+ ```python
109
+ table = s.get_table()
110
+ ```
111
+
112
+ ### Scrape paginated pages
113
+
114
+ ```python
115
+ s.scrape_pages("https://site.com/page/{}", 5, ".item")
116
+ ```
117
+
118
+ ### Auto "Next" pagination
119
+
120
+ ```python
121
+ s.scrape_auto_next("https://site.com", ".post")
122
+ ```
123
+
124
+ ### Parallel fetch
125
+
126
+ ```python
127
+ urls = ["https://a.com", "https://b.com"]
128
+ s.fetch_multiple(urls, workers=5)
129
+ ```
130
+
131
+ ### JSON API
132
+
133
+ ```python
134
+ s.get_json("https://api.site.com/data")
135
+ ```
136
+
137
+ ### Submit form
138
+
139
+ ```python
140
+ s.submit_form("https://site.com/login", {
141
+ "user": "name",
142
+ "pass": "password"
143
+ })
144
+ ```
145
+
146
+ ### Download files
147
+
148
+ ```python
149
+ s.download_file(url, "file.pdf")
150
+ s.download_images("images/")
151
+ ```
152
+
153
+ ### Export
154
+
155
+ ```python
156
+ s.export_csv(data, "data.csv")
157
+ s.export_json(data, "data.json")
158
+ ```
159
+
160
+ ---
161
+
162
+ ## Close resources
163
+
164
+ ```python
165
+ s.close()
166
+ ```
167
+
168
+ ---
169
+
170
+ ## Notes
171
+
172
+ * Automatically switches to browser mode if blocked
173
+ * Thread‑safe request handling
174
+ * Suitable for larger scraping jobs when combined with rate limiting and parallel fetching (see the sketch below)
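+
+ A short end-to-end sketch of the notes above, using only methods shown in this README; the URLs and selectors are placeholders:
+
+ ```python
+ from scraper import Scraper
+
+ # One rate-limited scraper instance shared across worker threads
+ s = Scraper(max_requests_per_minute=60, debug=True)
+
+ urls = ["https://site.com/page/1", "https://site.com/page/2", "https://site.com/page/3"]
+ results = s.fetch_multiple(urls, workers=3)  # [(url, success), ...]
+
+ items = s.scrape_pages("https://site.com/page/{}", 3, ".item")
+ s.export_json(items, "items.json")
+
+ s.close()
+ ```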
@@ -0,0 +1,147 @@
1
+ # Scraper
2
+
3
+ A simple and practical Python web scraper for real‑world websites. It supports normal HTTP scraping, JavaScript rendering via Playwright, rate limiting, pagination, form submission, file downloads, and structured data extraction.
4
+
5
+ Designed to be usable, reliable, and flexible.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ * HTTP scraping with `httpx`
12
+ * Automatic fallback to browser mode (Playwright) when blocked
13
+ * JavaScript rendering support
14
+ * Rate limiting
15
+ * Parallel fetching
16
+ * Table extraction with `rowspan` / `colspan`
17
+ * Pagination (patterned pages + "Next" button)
18
+ * JSON endpoints
19
+ * Form submission
20
+ * Image and file downloading
21
+ * CSV / JSON export
22
+ * Logging system
23
+
24
+ ---
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install httpx beautifulsoup4 playwright
30
+ playwright install chromium
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Basic Usage
36
+
37
+ ```python
38
+ from scraper import Scraper
39
+
40
+ s = Scraper(url="https://example.com", debug=True)
41
+ s.fetch()
42
+
43
+ print(s.get_text("h1"))
44
+ print(s.get_links())
45
+ ```
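+
+ The constructor also exposes rate-limiting and error-handling options (`delay`, `max_requests_per_minute`, `strict`, and custom `headers`). A minimal sketch, assuming a placeholder URL and using only parameters defined in the `Scraper` constructor shipped in this package:
+
+ ```python
+ from scraper import Scraper
+
+ # Cap requests at roughly 30 per minute and raise on HTTP errors
+ s = Scraper(url="https://example.com", max_requests_per_minute=30, strict=True)
+ s.fetch()
+ ```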
46
+
47
+ ---
48
+
49
+ ## Common Examples
50
+
51
+ ### Fetch a page
52
+
53
+ ```python
54
+ s.fetch("https://example.com")
55
+ ```
56
+
57
+ ### Force JS rendering
58
+
59
+ ```python
60
+ s.fetch("https://site.com", use_browser=True)
61
+ # or
62
+ s.render_js()
63
+ ```
64
+
65
+ ### Extract text
66
+
67
+ ```python
68
+ s.get_text(".title")
69
+ s.get_text_clean(".content")
70
+ ```
71
+
72
+ ### Get links & images
73
+
74
+ ```python
75
+ s.get_links()
76
+ s.get_images()
77
+ ```
78
+
79
+ ### Extract tables
80
+
81
+ ```python
82
+ table = s.get_table()
83
+ ```
84
+
85
+ ### Scrape paginated pages
86
+
87
+ ```python
88
+ s.scrape_pages("https://site.com/page/{}", 5, ".item")
89
+ ```
90
+
91
+ ### Auto "Next" pagination
92
+
93
+ ```python
94
+ s.scrape_auto_next("https://site.com", ".post")
95
+ ```
96
+
97
+ ### Parallel fetch
98
+
99
+ ```python
100
+ urls = ["https://a.com", "https://b.com"]
101
+ s.fetch_multiple(urls, workers=5)
102
+ ```
103
+
104
+ ### JSON API
105
+
106
+ ```python
107
+ s.get_json("https://api.site.com/data")
108
+ ```
109
+
110
+ ### Submit form
111
+
112
+ ```python
113
+ s.submit_form("https://site.com/login", {
114
+ "user": "name",
115
+ "pass": "password"
116
+ })
117
+ ```
118
+
119
+ ### Download files
120
+
121
+ ```python
122
+ s.download_file(url, "file.pdf")
123
+ s.download_images("images/")
124
+ ```
125
+
126
+ ### Export
127
+
128
+ ```python
129
+ s.export_csv(data, "data.csv")
130
+ s.export_json(data, "data.json")
131
+ ```
132
+
133
+ ---
134
+
135
+ ## Close resources
136
+
137
+ ```python
138
+ s.close()
139
+ ```
140
+
141
+ ---
142
+
143
+ ## Notes
144
+
145
+ * Automatically switches to browser mode if blocked
146
+ * Thread‑safe request handling
147
+ * Suitable for larger scraping jobs when combined with rate limiting and parallel fetching (see the sketch below)
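+
+ A short end-to-end sketch of the notes above, using only methods shown in this README; the URLs and selectors are placeholders:
+
+ ```python
+ from scraper import Scraper
+
+ # One rate-limited scraper instance shared across worker threads
+ s = Scraper(max_requests_per_minute=60, debug=True)
+
+ urls = ["https://site.com/page/1", "https://site.com/page/2", "https://site.com/page/3"]
+ results = s.fetch_multiple(urls, workers=3)  # [(url, success), ...]
+
+ items = s.scrape_pages("https://site.com/page/{}", 3, ".item")
+ s.export_json(items, "items.json")
+
+ s.close()
+ ```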
@@ -0,0 +1,601 @@
1
+ import time
2
+ import os
3
+ import httpx
4
+ import threading
5
+ import logging
6
+ from bs4 import BeautifulSoup
7
+ from urllib.parse import urljoin
8
+ from concurrent.futures import ThreadPoolExecutor
9
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
10
+
11
+ from .utils import clean_text, save_csv, save_json
12
+
13
+
14
+ class Scraper:
15
+ def __init__(
16
+ self,
17
+ url=None,
18
+ delay=0.0,
19
+ headers=None,
20
+ max_requests_per_minute=None,
21
+ debug=False,
22
+ strict=False,
23
+ ):
24
+ self._playwright = None
25
+ self._browser = None
26
+ self._context = None
27
+
28
+ self.base_url = url
29
+ self.delay = delay
30
+ self.debug = debug
31
+ self.strict = strict
32
+
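+ # Thread-local parsing state: each thread that calls fetch() gets its own
+ # response/soup, so fetch_multiple() workers do not overwrite one another.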
33
+ self._state = threading.local()
34
+ self._lock = threading.Lock()
35
+
36
+ # Setup logging
37
+ self.logger = logging.getLogger(__name__)
38
+ if debug:
39
+ self.logger.setLevel(logging.DEBUG)
40
+ else:
41
+ self.logger.setLevel(logging.WARNING)
42
+
43
+ # headers
44
+ head = {
45
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
46
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
47
+ "Chrome/121.0.0.0 Safari/537.36",
48
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
49
+ "Accept-Language": "en-US,en;q=0.9",
50
+ "Referer": "https://www.google.com/",
51
+ "Connection": "keep-alive",
52
+ "Upgrade-Insecure-Requests": "1",
53
+ }
54
+
55
+ if headers:
56
+ # Merge case-insensitively: drop any default the caller overrides, keep the caller's key casing
+ head = {k: v for k, v in head.items() if k.lower() not in {h.lower() for h in headers}}
+ head.update(headers)
57
+
58
+ self.client = httpx.Client(headers=head, follow_redirects=True, timeout=15.0)
59
+
60
+ self.last_req = 0
61
+
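+ # A per-minute cap is enforced by turning it into a minimum delay between requests.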
62
+ if max_requests_per_minute and max_requests_per_minute > 0:
63
+ self.delay = max(self.delay, 60.0 / max_requests_per_minute)
64
+
65
+ @property
66
+ def soup(self):
67
+ """Get the current BeautifulSoup object from thread-local state."""
68
+ return getattr(self._state, "soup", None)
69
+
70
+ @property
71
+ def response(self):
72
+ """Get the current response object from thread-local state."""
73
+ return getattr(self._state, "res", None)
74
+
75
+ def _init_browser(self):
76
+ """Initialize browser if not already initialized."""
77
+ if self._browser:
78
+ return
79
+
80
+ self._playwright = sync_playwright().start()
81
+ self._browser = self._playwright.chromium.launch(headless=True)
82
+ self._context = self._browser.new_context(
83
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
84
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
85
+ "Chrome/121.0.0.0 Safari/537.36",
86
+ locale="en-US",
87
+ )
88
+
89
+ def _close_browser(self):
90
+ """Close browser and cleanup resources."""
91
+ if self._context:
92
+ self._context.close()
93
+ if self._browser:
94
+ self._browser.close()
95
+ if self._playwright:
96
+ self._playwright.stop()
97
+
98
+ self._context = None
99
+ self._browser = None
100
+ self._playwright = None
101
+
102
+ def _wait(self):
103
+ """Enforce rate limiting between requests."""
104
+ with self._lock:
105
+ passed = time.time() - self.last_req
106
+ if passed < self.delay:
107
+ time.sleep(self.delay - passed)
108
+ self.last_req = time.time()
109
+
110
+ def fetch(self, url=None, retries=0, use_browser=False):
111
+ """
112
+ Fetch a URL and parse its content.
113
+
114
+ Args:
115
+ url: URL to fetch (uses base_url if not provided)
116
+ retries: Number of retries on failure
117
+ use_browser: Force using Playwright browser instead of httpx
118
+
119
+ Returns:
120
+ self if successful, None if failed
121
+ """
122
+ target = url or self.base_url
123
+ if not target:
124
+ self.logger.warning("No URL provided to fetch")
125
+ return self
126
+
127
+ if use_browser:
128
+ return self._fetch_browser(target)
129
+
130
+ for i in range(retries + 1):
131
+ try:
132
+ self._wait()
133
+ self.logger.debug(f"Fetching: {target}")
134
+
135
+ res = self.client.get(target)
136
+
137
+ text_low = res.text.lower()
138
+ # Check for common blocking indicators
139
+ blocking_indicators = (
140
+ res.status_code in (403, 429),
141
+ "captcha" in text_low,
142
+ "cloudflare" in text_low,
143
+ "verify you are human" in text_low,
144
+ "access denied" in text_low,
145
+ )
146
+
147
+ if any(blocking_indicators):
148
+ self.logger.warning(
149
+ "Access blocked or soft-blocked -> switching to browser mode"
150
+ )
151
+ return self._fetch_browser(target)
152
+
153
+ if self.strict:
154
+ res.raise_for_status()
155
+
156
+ self._state.res = res
157
+ self._state.soup = BeautifulSoup(res.text, "html.parser")
158
+ return self
159
+
160
+ except httpx.HTTPError as e:
161
+ self.logger.error(f"HTTP error on {target}: {e}")
162
+ if self.strict and i == retries:
163
+ raise
164
+ if i < retries:
165
+ time.sleep(1)
166
+ except Exception as e:
167
+ self.logger.error(f"Error on {target}: {e}")
168
+ if self.strict and i == retries:
169
+ raise
170
+ if i < retries:
171
+ time.sleep(1)
172
+
173
+ return None
174
+
175
+ def _fetch_browser(self, target, wait=1.5):
176
+ """
177
+ Fetch URL using Playwright browser for JavaScript-heavy pages.
178
+
179
+ Args:
180
+ target: URL to fetch
181
+ wait: Time to wait after page load (seconds)
182
+
183
+ Returns:
184
+ self if successful
185
+ """
186
+ self.logger.debug(f"[browser] fetching: {target}")
187
+
188
+ self._init_browser()
189
+
190
+ page = self._context.new_page()
191
+ try:
192
+ try:
193
+ page.goto(target, wait_until="networkidle", timeout=15000)
194
+ except PlaywrightTimeout:
195
+ self.logger.warning(
196
+ f"Playwright timeout for {target}, continuing anyway"
197
+ )
198
+
199
+ time.sleep(wait)
200
+ html = page.content()
201
+ except Exception as e:
202
+ self.logger.error(f"Browser error fetching {target}: {e}")
203
+ return None
204
+ finally:
205
+ page.close()
206
+
207
+ # Create mock response object
208
+ mock_response = type(
209
+ "Response", (), {"url": target, "text": html, "status_code": 200}
210
+ )()
211
+ self._state.res = mock_response
212
+ self._state.soup = BeautifulSoup(html, "html.parser")
213
+ return self
214
+
215
+ def render_js(self, wait=2):
216
+ """
217
+ Re-fetch current page with JavaScript rendering enabled.
218
+
219
+ Args:
220
+ wait: Time to wait after page load (seconds)
221
+
222
+ Returns:
223
+ self if successful
224
+ """
225
+ if not self.response:
226
+ self.logger.warning("No current response to render")
227
+ return self
228
+ return self._fetch_browser(str(self.response.url), wait=wait)
229
+
230
+ def get_text(self, selector):
231
+ """
232
+ Extract text from first element matching selector.
233
+
234
+ Args:
235
+ selector: CSS selector
236
+
237
+ Returns:
238
+ Text content or empty string
239
+ """
240
+ if not self.soup:
241
+ self.logger.warning("No soup object available")
242
+ return ""
243
+ el = self.soup.select_one(selector)
244
+ return el.get_text(strip=True) if el else ""
245
+
246
+ def get_text_clean(self, selector):
247
+ """Extract and clean text from element."""
248
+ return clean_text(self.get_text(selector))
249
+
250
+ def get_links(self):
251
+ """Extract all unique links from current page."""
252
+ if not self.soup:
253
+ self.logger.warning("No soup object available")
254
+ return []
255
+
256
+ links = set()
257
+ for a in self.soup.find_all("a", href=True):
258
+ try:
259
+ href = a.get("href", "").strip()
260
+ if href: # Skip empty hrefs
261
+ full_url = urljoin(str(self.response.url), href)
262
+ links.add(full_url)
263
+ except Exception as e:
264
+ self.logger.debug(f"Error processing link: {e}")
265
+ continue
266
+
267
+ return list(links)
268
+
269
+ def get_images(self):
270
+ """Extract all unique image URLs from current page."""
271
+ if not self.soup:
272
+ self.logger.warning("No soup object available")
273
+ return []
274
+
275
+ images = set()
276
+ for img in self.soup.find_all("img", src=True):
277
+ try:
278
+ src = img.get("src", "").strip()
279
+ if src: # Skip empty src attributes
280
+ full_url = urljoin(str(self.response.url), src)
281
+ images.add(full_url)
282
+ except Exception as e:
283
+ self.logger.debug(f"Error processing image: {e}")
284
+ continue
285
+
286
+ return list(images)
287
+
288
+ def get_json(self, url=None):
289
+ """
290
+ Fetch and parse JSON from URL.
291
+
292
+ Args:
293
+ url: URL to fetch (uses base_url if not provided)
294
+
295
+ Returns:
296
+ Parsed JSON data
297
+ """
298
+ target = url or self.base_url
299
+ if not target:
300
+ raise ValueError("No URL provided for JSON fetch")
301
+
302
+ self._wait()
303
+ try:
304
+ r = self.client.get(target)
305
+ r.raise_for_status()
306
+ return r.json()
307
+ except httpx.HTTPError as e:
308
+ self.logger.error(f"HTTP error fetching JSON from {target}: {e}")
309
+ raise
310
+ except ValueError as e:
311
+ self.logger.error(f"Invalid JSON from {target}: {e}")
312
+ raise
313
+ finally:
314
+ self.last_req = time.time()
315
+
316
+ def get_table(self, selector=None):
317
+ """
318
+ Extract table data with support for colspan/rowspan.
319
+
320
+ Args:
321
+ selector: CSS selector for table(s), defaults to ".wikitable"
322
+
323
+ Returns:
324
+ List of lists representing table rows
325
+ """
326
+ if not self.soup:
327
+ self.logger.warning("No soup object available")
328
+ return []
329
+
330
+ tables = (
331
+ self.soup.select(selector)
332
+ if selector
333
+ else self.soup.select("table.wikitable")
334
+ )
335
+
336
+ if not tables:
337
+ self.logger.debug(f"No tables found with selector: {selector}")
338
+ return []
339
+
340
+ # Find table with most rows
341
+ table = max(tables, key=lambda t: len(t.find_all("tr"))) if tables else None
342
+ if not table:
343
+ return []
344
+
345
+ rows = table.find_all("tr")
346
+ matrix = []
347
+ active_spans = {}
348
+ max_cols = 0
349
+
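+ # active_spans maps column index -> [value, rows_left]: a cell with rowspan > 1 keeps
+ # re-emitting its value into that column of the following rows until rows_left is used up.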
350
+ for tr in rows:
351
+ cells = tr.find_all(["td", "th"])
352
+ row = []
353
+ col = 0
354
+ cell_i = 0
355
+
356
+ while cell_i < len(cells) or col in active_spans:
357
+ if col in active_spans:
358
+ val, remaining = active_spans[col]
359
+ row.append(val)
360
+ if remaining > 1:
361
+ active_spans[col][1] -= 1
362
+ else:
363
+ del active_spans[col]
364
+ col += 1
365
+ continue
366
+
367
+ if cell_i >= len(cells):
368
+ break
369
+
370
+ cell = cells[cell_i]
371
+ cell_i += 1
372
+
373
+ try:
374
+ rowspan = int(cell.get("rowspan", 1))
375
+ colspan = int(cell.get("colspan", 1))
376
+ except (ValueError, TypeError):
377
+ rowspan, colspan = 1, 1
378
+
379
+ value = cell.get_text(" ", strip=True)
380
+
381
+ if rowspan > 1:
382
+ for c in range(colspan):
383
+ active_spans[col + c] = [value, rowspan - 1]
384
+
385
+ for _ in range(colspan):
386
+ row.append(value)
387
+ col += 1
388
+
389
+ max_cols = max(max_cols, len(row))
390
+ matrix.append(row)
391
+
392
+ # Pad rows with empty strings
393
+ for r in matrix:
394
+ if len(r) < max_cols:
395
+ r.extend([""] * (max_cols - len(r)))
396
+
397
+ return matrix
398
+
399
+ def scrape_pages(self, url_pattern, pages, selector):
400
+ """
401
+ Scrape multiple pages with numbered URL pattern.
402
+
403
+ Args:
404
+ url_pattern: URL pattern with {}, e.g., "https://example.com/page/{}"
405
+ pages: Number of pages to scrape
406
+ selector: CSS selector for elements to extract
407
+
408
+ Returns:
409
+ List of extracted text values
410
+ """
411
+ if pages < 1:
412
+ raise ValueError("pages must be >= 1")
413
+
414
+ results = []
415
+ for i in range(1, pages + 1):
416
+ self.logger.debug(f"Scraping page {i}/{pages}")
417
+ try:
418
+ formatted_url = url_pattern.format(i)
419
+ if self.fetch(formatted_url):
420
+ results.extend(
421
+ [el.get_text(strip=True) for el in self.soup.select(selector)]
422
+ )
423
+ except Exception as e:
424
+ self.logger.error(f"Error scraping page {i}: {e}")
425
+ if self.strict:
426
+ raise
427
+ return results
428
+
429
+ def scrape_auto_next(self, url, selector, max_pages=10):
430
+ """
431
+ Scrape pages by following "Next" button.
432
+
433
+ Args:
434
+ url: Starting URL
435
+ selector: CSS selector for elements to extract
436
+ max_pages: Maximum pages to scrape
437
+
438
+ Returns:
439
+ List of extracted text values
440
+ """
441
+ if max_pages < 1:
442
+ raise ValueError("max_pages must be >= 1")
443
+
444
+ data, curr = [], url
445
+ for page_num in range(max_pages):
446
+ self.logger.debug(f"Scraping auto-next page {page_num + 1}/{max_pages}")
447
+
448
+ if not self.fetch(curr):
449
+ break
450
+
451
+ data.extend([el.get_text(strip=True) for el in self.soup.select(selector)])
452
+
453
+ # Resilient "Next" link detection: link text, then rel="next", then common CSS patterns
454
+ nxt = (
455
+ self.soup.find("a", string=lambda t: t and "next" in t.lower().strip())
456
+ or self.soup.find("a", attrs={"rel": "next"})
457
+ or self.soup.select_one("li.next a")
458
+ or self.soup.select_one("a.next")
459
+ )
460
+
461
+ if nxt and nxt.get("href"):
462
+ curr = urljoin(str(self.response.url), nxt["href"])
463
+ else:
464
+ self.logger.debug("No 'Next' button found, stopping pagination")
465
+ break
466
+
467
+ return data
468
+
469
+ def fetch_multiple(self, urls, workers=5):
470
+ """
471
+ Fetch multiple URLs in parallel.
472
+
473
+ Args:
474
+ urls: List of URLs to fetch
475
+ workers: Number of worker threads
476
+
477
+ Returns:
478
+ List of tuples (url, success_status)
479
+ """
480
+ if not urls:
481
+ self.logger.warning("No URLs provided for parallel fetch")
482
+ return []
483
+
484
+ if workers < 1:
485
+ raise ValueError("workers must be >= 1")
486
+
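+ # Each worker parses into its own thread-local state, so only (url, success) pairs
+ # are returned here; re-fetch a URL on the calling thread if its soup is needed later.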
487
+ with ThreadPoolExecutor(max_workers=workers) as pool:
488
+ return list(pool.map(lambda u: (u, self.fetch(u) is not None), urls))
489
+
490
+ def submit_form(self, url, data):
491
+ """
492
+ Submit form data via POST.
493
+
494
+ Args:
495
+ url: Form endpoint URL
496
+ data: Dictionary of form data
497
+
498
+ Returns:
499
+ self
500
+ """
501
+ if not url or not data:
502
+ raise ValueError("url and data are required")
503
+
504
+ self._wait()
505
+ try:
506
+ res = self.client.post(url, data=data)
507
+ res.raise_for_status()
508
+ self._state.res = res
509
+ self._state.soup = BeautifulSoup(res.text, "html.parser")
510
+ except httpx.HTTPError as e:
511
+ self.logger.error(f"HTTP error submitting form to {url}: {e}")
512
+ if self.strict:
513
+ raise
514
+ return self
515
+
516
+ def download_file(self, url, dest):
517
+ """
518
+ Download file from URL.
519
+
520
+ Args:
521
+ url: File URL
522
+ dest: Destination file path
523
+ """
524
+ if not url or not dest:
525
+ raise ValueError("url and dest are required")
526
+
527
+ try:
528
+ os.makedirs(os.path.dirname(dest) or ".", exist_ok=True)
529
+ with self.client.stream("GET", url) as r:
530
+ r.raise_for_status()
531
+ with open(dest, "wb") as f:
532
+ for chunk in r.iter_bytes():
533
+ f.write(chunk)
534
+ self.logger.debug(f"Downloaded: {dest}")
535
+ except httpx.HTTPError as e:
536
+ self.logger.error(f"HTTP error downloading {url}: {e}")
537
+ raise
538
+ except IOError as e:
539
+ self.logger.error(f"Error writing to {dest}: {e}")
540
+ raise
541
+
542
+ def download_images(self, folder="images/"):
543
+ """
544
+ Download all images from current page.
545
+
546
+ Args:
547
+ folder: Destination folder for images
548
+ """
549
+ if not folder:
550
+ raise ValueError("folder is required")
551
+
552
+ images = self.get_images()
553
+ if not images:
554
+ self.logger.warning("No images found to download")
555
+ return
556
+
557
+ for i, url in enumerate(images):
558
+ try:
559
+ # Guess the extension from the URL path; fall back to jpg when it is missing or implausible
+ ext = url.split("?")[0].rsplit(".", 1)[-1].lower()
+ if not ext or len(ext) > 4 or "/" in ext:
+ ext = "jpg"
562
+ dest = os.path.join(folder, f"img_{i}.{ext}")
563
+ self.download_file(url, dest)
564
+ except Exception as e:
565
+ self.logger.error(f"Error downloading image {i} from {url}: {e}")
566
+ if self.strict:
567
+ raise
568
+
569
+ def list_selectors(self):
570
+ """List available HTML tags, IDs, and classes for debugging."""
571
+ if not self.soup:
572
+ self.logger.warning("No soup object available")
573
+ return
574
+
575
+ tags = list(set(el.name for el in self.soup.find_all()))[:15]
576
+ ids = list(set(el["id"] for el in self.soup.find_all(id=True)))[:15]
577
+ classes = list(
578
+ set(c for el in self.soup.find_all(class_=True) for c in el["class"])
579
+ )[:15]
580
+
581
+ print(f"tags: {tags}\nids: {ids}\nclasses: {classes}")
582
+
583
+ def export_csv(self, data, path):
584
+ """Export data to CSV file."""
585
+ if not path:
586
+ raise ValueError("path is required")
587
+ save_csv(data, path)
588
+ self.logger.debug(f"Exported CSV: {path}")
589
+
590
+ def export_json(self, data, path):
591
+ """Export data to JSON file."""
592
+ if not path:
593
+ raise ValueError("path is required")
594
+ save_json(data, path)
595
+ self.logger.debug(f"Exported JSON: {path}")
596
+
597
+ def close(self):
598
+ """Close client and cleanup resources."""
599
+ self.client.close()
600
+ self._close_browser()
601
+ self.logger.debug("Scraper closed")
@@ -0,0 +1,27 @@
1
+ import os
2
+ import csv
3
+ import json
4
+
5
+
6
+ def clean_text(text):
7
+ return " ".join(text.split())
8
+
9
+
10
+ def save_csv(data, path):
11
+ # exports data to csv
12
+ os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
13
+ with open(path, "w", newline="", encoding="utf-8") as f:
14
+ w = csv.writer(f)
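+ # Dict rows: the header comes from the first row's keys (assumes all rows share the same keys)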
15
+ if data and isinstance(data[0], dict):
16
+ w.writerow(data[0].keys())
17
+ for r in data:
18
+ w.writerow(r.values())
19
+ else:
20
+ w.writerows(data)
21
+
22
+
23
+ def save_json(data, path):
24
+ # exports data to json
25
+ os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
26
+ with open(path, "w", encoding="utf-8") as f:
27
+ json.dump(data, f, indent=4)
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "ezextract"
7
+ version = "0.1.1"
8
+ description = "A simple, human-friendly web scraper wrapper using httpx, BeautifulSoup, and Playwright."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ { name = "E4crypt3d" }
14
+ ]
15
+ keywords = ["scraper", "webscraping", "httpx", "playwright", "crawler", "data-extraction"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "License :: OSI Approved :: MIT License",
25
+ "Operating System :: OS Independent",
26
+ "Topic :: Internet :: WWW/HTTP",
27
+ "Topic :: Software Development :: Libraries :: Python Modules",
28
+ ]
29
+ dependencies = [
30
+ "httpx>=0.24.0",
31
+ "beautifulsoup4>=4.12.0",
32
+ "playwright>=1.35.0",
33
+ ]
34
+
35
+ [project.urls]
36
+ "Homepage" = "https://github.com/E4crypt3d/ezextract"
37
+ "Bug Tracker" = "https://github.com/E4crypt3d/ezextract/issues"
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["ezextract"]