pyscrappy 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. pyscrappy-1.0.0/.github/workflows/ci.yml +41 -0
  2. pyscrappy-1.0.0/.github/workflows/python-publish.yml +24 -0
  3. pyscrappy-1.0.0/.gitignore +9 -0
  4. pyscrappy-1.0.0/CONTRIBUTING.md +312 -0
  5. pyscrappy-1.0.0/LICENSE +21 -0
  6. pyscrappy-1.0.0/PKG-INFO +262 -0
  7. pyscrappy-1.0.0/PyScrappy.png +0 -0
  8. pyscrappy-1.0.0/README.md +223 -0
  9. pyscrappy-1.0.0/docs/evidence-cover-sheet.md +144 -0
  10. pyscrappy-1.0.0/pyproject.toml +67 -0
  11. pyscrappy-1.0.0/src/pyscrappy/__init__.py +131 -0
  12. pyscrappy-1.0.0/src/pyscrappy/core/__init__.py +27 -0
  13. pyscrappy-1.0.0/src/pyscrappy/core/base.py +76 -0
  14. pyscrappy-1.0.0/src/pyscrappy/core/browser.py +110 -0
  15. pyscrappy-1.0.0/src/pyscrappy/core/config.py +47 -0
  16. pyscrappy-1.0.0/src/pyscrappy/core/exceptions.py +31 -0
  17. pyscrappy-1.0.0/src/pyscrappy/core/http.py +140 -0
  18. pyscrappy-1.0.0/src/pyscrappy/core/models.py +84 -0
  19. pyscrappy-1.0.0/src/pyscrappy/generic/__init__.py +3 -0
  20. pyscrappy-1.0.0/src/pyscrappy/generic/extractors.py +204 -0
  21. pyscrappy-1.0.0/src/pyscrappy/generic/pagination.py +84 -0
  22. pyscrappy-1.0.0/src/pyscrappy/generic/scraper.py +229 -0
  23. pyscrappy-1.0.0/src/pyscrappy/py.typed +0 -0
  24. pyscrappy-1.0.0/src/pyscrappy/scrapers/__init__.py +37 -0
  25. pyscrappy-1.0.0/src/pyscrappy/scrapers/alibaba.py +149 -0
  26. pyscrappy-1.0.0/src/pyscrappy/scrapers/amazon.py +162 -0
  27. pyscrappy-1.0.0/src/pyscrappy/scrapers/flipkart.py +149 -0
  28. pyscrappy-1.0.0/src/pyscrappy/scrapers/image_search.py +170 -0
  29. pyscrappy-1.0.0/src/pyscrappy/scrapers/imdb.py +223 -0
  30. pyscrappy-1.0.0/src/pyscrappy/scrapers/instagram.py +253 -0
  31. pyscrappy-1.0.0/src/pyscrappy/scrapers/linkedin.py +148 -0
  32. pyscrappy-1.0.0/src/pyscrappy/scrapers/news.py +230 -0
  33. pyscrappy-1.0.0/src/pyscrappy/scrapers/snapdeal.py +130 -0
  34. pyscrappy-1.0.0/src/pyscrappy/scrapers/soundcloud.py +184 -0
  35. pyscrappy-1.0.0/src/pyscrappy/scrapers/spotify.py +234 -0
  36. pyscrappy-1.0.0/src/pyscrappy/scrapers/stock.py +222 -0
  37. pyscrappy-1.0.0/src/pyscrappy/scrapers/swiggy.py +215 -0
  38. pyscrappy-1.0.0/src/pyscrappy/scrapers/twitter.py +174 -0
  39. pyscrappy-1.0.0/src/pyscrappy/scrapers/wikipedia.py +195 -0
  40. pyscrappy-1.0.0/src/pyscrappy/scrapers/youtube.py +211 -0
  41. pyscrappy-1.0.0/src/pyscrappy/scrapers/zomato.py +219 -0
  42. pyscrappy-1.0.0/tests/__init__.py +0 -0
  43. pyscrappy-1.0.0/tests/test_core/__init__.py +0 -0
  44. pyscrappy-1.0.0/tests/test_core/test_base.py +156 -0
  45. pyscrappy-1.0.0/tests/test_core/test_browser.py +156 -0
  46. pyscrappy-1.0.0/tests/test_core/test_config.py +60 -0
  47. pyscrappy-1.0.0/tests/test_core/test_exceptions.py +59 -0
  48. pyscrappy-1.0.0/tests/test_core/test_http.py +241 -0
  49. pyscrappy-1.0.0/tests/test_core/test_models.py +128 -0
  50. pyscrappy-1.0.0/tests/test_generic/__init__.py +0 -0
  51. pyscrappy-1.0.0/tests/test_generic/test_extractors.py +357 -0
  52. pyscrappy-1.0.0/tests/test_generic/test_pagination.py +139 -0
  53. pyscrappy-1.0.0/tests/test_generic/test_scraper.py +322 -0
  54. pyscrappy-1.0.0/tests/test_init.py +54 -0
  55. pyscrappy-1.0.0/tests/test_scrapers/__init__.py +0 -0
  56. pyscrappy-1.0.0/tests/test_scrapers/test_ecommerce.py +249 -0
  57. pyscrappy-1.0.0/tests/test_scrapers/test_food.py +219 -0
  58. pyscrappy-1.0.0/tests/test_scrapers/test_imdb.py +197 -0
  59. pyscrappy-1.0.0/tests/test_scrapers/test_music.py +238 -0
  60. pyscrappy-1.0.0/tests/test_scrapers/test_news.py +223 -0
  61. pyscrappy-1.0.0/tests/test_scrapers/test_other.py +223 -0
  62. pyscrappy-1.0.0/tests/test_scrapers/test_social.py +374 -0
  63. pyscrappy-1.0.0/tests/test_scrapers/test_stock.py +214 -0
  64. pyscrappy-1.0.0/tests/test_scrapers/test_wikipedia.py +155 -0
@@ -0,0 +1,41 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - run: pip install ruff
18
+ - run: ruff check src/
19
+
20
+ test:
21
+ runs-on: ubuntu-latest
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.9", "3.11", "3.12", "3.13"]
25
+ steps:
26
+ - uses: actions/checkout@v4
27
+ - uses: actions/setup-python@v5
28
+ with:
29
+ python-version: ${{ matrix.python-version }}
30
+ - run: pip install -e '.[all]' pytest
31
+ - run: pytest tests/ -v
32
+
33
+ build:
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+ - uses: actions/setup-python@v5
38
+ with:
39
+ python-version: "3.12"
40
+ - run: pip install build
41
+ - run: python -m build
@@ -0,0 +1,24 @@
1
+ name: Upload Python Package
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ id-token: write
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - name: Install build tools
18
+ run: pip install build
19
+ - name: Build package
20
+ run: python -m build
21
+ - name: Publish to PyPI
22
+ uses: pypa/gh-action-pypi-publish@release/v1
23
+ with:
24
+ password: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,9 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
8
+ *.egg
9
+ .mypy_cache/
@@ -0,0 +1,312 @@
1
+ # Contributing to PyScrappy
2
+
3
+ Thank you for your interest in contributing to PyScrappy! We welcome contributions of all kinds — from bug fixes and documentation improvements to new scrapers and core features.
4
+
5
+ ## Checklist before submitting a PR
6
+
7
+ Here are the core requirements for any PR submitted to PyScrappy:
8
+
9
+ - [ ] **Keep scope isolated** — your changes should address 1 specific problem at a time
10
+ - [ ] **Add tests** — adding at least 1 test is a hard requirement — [see details](#adding-tests)
11
+ - [ ] **Ensure your PR passes all checks:**
12
+ - [ ] Unit tests — `pytest tests/ -v`
13
+ - [ ] Linting — `ruff check src/`
14
+
15
+ ## Quick Start
16
+
17
+ ### 1. Setup Your Local Development Environment
18
+
19
+ ```sh
20
+ # Fork the repository on GitHub (click the Fork button at https://github.com/mldsveda/PyScrappy)
21
+ # Then clone your fork locally
22
+ git clone https://github.com/YOUR_USERNAME/PyScrappy.git
23
+ cd PyScrappy
24
+
25
+ # Create a new branch for your feature
26
+ git checkout -b your-feature-branch
27
+
28
+ # Install the package in editable mode with all extras
29
+ pip install -e '.[all]'
30
+
31
+ # Install development tools
32
+ pip install pytest ruff mypy
33
+
34
+ # Verify your setup works
35
+ pytest tests/ -v
36
+ ```
37
+
38
+ That's it! Your local development environment is ready.
39
+
40
+ ### 2. Development Workflow
41
+
42
+ Here's the recommended workflow for making changes:
43
+
44
+ ```sh
45
+ # Make your changes to the code
46
+ # ...
47
+
48
+ # Run linting to catch issues early
49
+ ruff check src/
50
+
51
+ # Run the full test suite
52
+ pytest tests/ -v
53
+
54
+ # Commit your changes
55
+ git add .
56
+ git commit -m "Your descriptive commit message"
57
+
58
+ # Push and create a PR
59
+ git push origin your-feature-branch
60
+ ```
61
+
62
+ ## Adding Tests
63
+
64
+ Adding at least 1 test is a **hard requirement** for all PRs.
65
+
66
+ ### Where to Add Tests
67
+
68
+ | What you changed | Where to add tests |
69
+ |---|---|
70
+ | `src/pyscrappy/core/` | `tests/test_core/` |
71
+ | `src/pyscrappy/generic/` | `tests/test_generic/` |
72
+ | `src/pyscrappy/scrapers/` | `tests/test_scrapers/` |
73
+ | Package-level (`__init__.py`) | `tests/test_init.py` |
74
+
75
+ ### File Naming Convention
76
+
77
+ The `tests/` directory mirrors the structure of `src/pyscrappy/`:
78
+
79
+ | Source file | Test file |
80
+ |---|---|
81
+ | `src/pyscrappy/core/config.py` | `tests/test_core/test_config.py` |
82
+ | `src/pyscrappy/core/http.py` | `tests/test_core/test_http.py` |
83
+ | `src/pyscrappy/generic/extractors.py` | `tests/test_generic/test_extractors.py` |
84
+ | `src/pyscrappy/scrapers/wikipedia.py` | `tests/test_scrapers/test_wikipedia.py` |
85
+
86
+ ### Key Testing Principles
87
+
88
+ - **Mock HTTP calls** — never make real network requests in tests. Use `unittest.mock.MagicMock` to mock the `_http` attribute on scrapers.
89
+ - **Test parsing logic** — provide realistic sample HTML/JSON and verify the scraper extracts the correct fields.
90
+ - **Test edge cases** — empty responses, missing fields, malformed HTML.
91
+ - **Test validation** — ensure proper errors are raised for invalid arguments.
92
+
93
+ ### Example Test
94
+
95
+ ```python
96
+ from unittest.mock import MagicMock
97
+ from pyscrappy.scrapers.wikipedia import WikipediaScraper
98
+
99
+ SAMPLE_HTML = """
100
+ <html><body>
101
+ <div id="mw-content-text">
102
+ <div class="mw-parser-output">
103
+ <p>Python is a high-level programming language.</p>
104
+ </div>
105
+ </div>
106
+ </body></html>
107
+ """
108
+
109
+ def test_wikipedia_scrape_returns_paragraphs():
110
+ """Test that WikipediaScraper extracts paragraph text."""
111
+ scraper = WikipediaScraper()
112
+ mock_http = MagicMock()
113
+ mock_http.get_html.return_value = SAMPLE_HTML
114
+ scraper._http = mock_http
115
+
116
+ result = scraper.scrape(query="Python", mode="paragraphs")
117
+
118
+ assert len(result.data) > 0
119
+ assert result.data[0]["type"] == "paragraph"
120
+ assert "Python" in result.data[0]["text"]
121
+ scraper.close()
122
+ ```
123
+
124
+ ## Running Tests and Checks
125
+
126
+ ### Running Unit Tests
127
+
128
+ Run the full test suite:
129
+
130
+ ```sh
131
+ pytest tests/ -v
132
+ ```
133
+
134
+ Run a specific test file:
135
+
136
+ ```sh
137
+ pytest tests/test_core/test_http.py -v
138
+ ```
139
+
140
+ Run a specific test:
141
+
142
+ ```sh
143
+ pytest tests/test_scrapers/test_wikipedia.py::TestWikipediaScraperFull::test_full_mode -v
144
+ ```
145
+
146
+ ### Running Linting
147
+
148
+ Run Ruff linting (matches CI):
149
+
150
+ ```sh
151
+ ruff check src/
152
+ ```
153
+
154
+ Auto-fix linting issues:
155
+
156
+ ```sh
157
+ ruff check src/ --fix
158
+ ```
159
+
160
+ ### Running Type Checks (optional)
161
+
162
+ ```sh
163
+ mypy src/pyscrappy/
164
+ ```
165
+
166
+ ### CI Compatibility
167
+
168
+ To ensure your changes will pass CI, run the same checks locally:
169
+
170
+ ```sh
171
+ # These match the GitHub Actions workflows exactly
172
+ ruff check src/
173
+ pytest tests/ -v
174
+ ```
175
+
176
+ CI runs tests across Python 3.9, 3.11, 3.12, and 3.13.
177
+
178
+ ## Project Structure
179
+
180
+ ```
181
+ PyScrappy/
182
+ ├── src/pyscrappy/
183
+ │ ├── __init__.py # Package exports and convenience scrape() function
184
+ │ ├── core/ # Core infrastructure
185
+ │ │ ├── base.py # BaseScraper abstract class
186
+ │ │ ├── browser.py # Playwright browser manager
187
+ │ │ ├── config.py # ScraperConfig dataclass
188
+ │ │ ├── exceptions.py # Custom exception hierarchy
189
+ │ │ ├── http.py # HTTP client with retries/rate-limiting
190
+ │ │ └── models.py # ScrapeResult, ScrapeMetadata, ScrapeError
191
+ │ ├── generic/ # GenericScraper (works on any URL)
192
+ │ │ ├── scraper.py # Main GenericScraper class
193
+ │ │ ├── extractors.py # Metadata, Text, Link, Image, Table extractors
194
+ │ │ └── pagination.py # Auto-pagination detection
195
+ │ └── scrapers/ # Site-specific scrapers (17 total)
196
+ │ ├── wikipedia.py
197
+ │ ├── imdb.py
198
+ │ ├── stock.py
199
+ │ └── ...
200
+ ├── tests/
201
+ │ ├── test_core/ # Tests for core/
202
+ │ ├── test_generic/ # Tests for generic/
203
+ │ ├── test_scrapers/ # Tests for scrapers/
204
+ │ └── test_init.py # Package-level import tests
205
+ └── pyproject.toml # Build config, dependencies, tool settings
206
+ ```
207
+
208
+ ## Adding a New Scraper
209
+
210
+ Want to add support for a new website? Here's how:
211
+
212
+ ### 1. Create the scraper
213
+
214
+ Create a new file in `src/pyscrappy/scrapers/`, e.g. `mysite.py`:
215
+
216
+ ```python
217
+ from __future__ import annotations
218
+ from typing import Any
219
+ from pyscrappy.core.base import BaseScraper
220
+ from pyscrappy.core.models import ScrapeMetadata, ScrapeResult
221
+
222
+ class MySiteScraper(BaseScraper):
223
+ name = "mysite"
224
+
225
+ def scrape(self, query: str, **kwargs: object) -> ScrapeResult:
226
+ url = f"https://mysite.com/search?q={query}"
227
+ soup = self.fetch_and_parse(url)
228
+
229
+ items: list[dict[str, Any]] = []
230
+ for card in soup.select(".result-card"):
231
+ items.append({
232
+ "title": card.select_one("h2").get_text(strip=True),
233
+ "url": card.select_one("a")["href"],
234
+ })
235
+
236
+ return ScrapeResult(
237
+ data=items,
238
+ metadata=ScrapeMetadata(source_urls=[url], scraper=self.name),
239
+ )
240
+ ```
241
+
242
+ ### 2. Register the export
243
+
244
+ Add your scraper to:
245
+ - `src/pyscrappy/scrapers/__init__.py`
246
+ - `src/pyscrappy/__init__.py` (import + add to `__all__`)
247
+
248
+ ### 3. Add tests
249
+
250
+ Create `tests/test_scrapers/test_mysite.py` with mock HTML and assertions.
251
+
252
+ ### 4. Submit your PR
253
+
254
+ ## Code Quality Standards
255
+
256
+ - **Style** — enforced by [Ruff](https://docs.astral.sh/ruff/) with a 100-character line length
257
+ - **Type hints** — all public APIs should have type annotations
258
+ - **Python version** — must be compatible with Python 3.9+
259
+ - **No real HTTP calls in tests** — always mock network requests
260
+ - **Match existing patterns** — follow the conventions you see in existing scrapers
261
+
262
+ ## Common Issues and Solutions
263
+
264
+ ### Linting Failures
265
+
266
+ If `ruff check src/` fails:
267
+ - Run `ruff check src/ --fix` to auto-fix most issues
268
+ - Check import ordering (Ruff enforces isort-compatible ordering)
269
+ - Ensure lines are under 100 characters
270
+
271
+ ### Test Failures
272
+
273
+ If `pytest tests/ -v` fails:
274
+ - Check if you broke existing functionality
275
+ - Ensure tests use mocks, not real API calls
276
+ - Check test file naming conventions (`test_*.py`)
277
+ - Make sure `__init__.py` exists in test directories
278
+
279
+ ### Import Errors
280
+
281
+ If you get `ModuleNotFoundError: No module named 'pyscrappy'`:
282
+ ```sh
283
+ pip install -e '.[all]'
284
+ ```
285
+
286
+ ## Submitting Your PR
287
+
288
+ 1. **Push your branch:** `git push origin your-feature-branch`
289
+ 2. **Create a PR:** go to GitHub and create a pull request against `main`
290
+ 3. **Fill out the description:** clearly explain what your changes do and why
291
+ 4. **Wait for CI:** ensure all checks pass (lint, tests across Python versions, build)
292
+ 5. **Address feedback:** make requested changes and push updates
293
+ 6. **Merge:** once approved, your PR will be merged!
294
+
295
+ ## Getting Help
296
+
297
+ If you need help:
298
+
299
+ - [Create an issue](https://github.com/mldsveda/PyScrappy/issues)
300
+ - Check existing [issues](https://github.com/mldsveda/PyScrappy/issues) for similar questions
301
+
302
+ ## What to Contribute
303
+
304
+ Looking for ideas? Check out:
305
+
306
+ - **Bug fixes** — check [open issues](https://github.com/mldsveda/PyScrappy/issues)
307
+ - **New scrapers** — add support for a website you use
308
+ - **Test coverage** — improve tests for existing scrapers
309
+ - **Documentation** — improve docstrings, examples, or guides
310
+ - **Core improvements** — better error handling, caching, async support
311
+
312
+ Thank you for contributing to PyScrappy!
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Vedant Tibrewal, Vedaant Singh.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyscrappy
3
+ Version: 1.0.0
4
+ Summary: A robust, all-in-one Python web scraping toolkit
5
+ Project-URL: Homepage, https://github.com/mldsveda/PyScrappy
6
+ Project-URL: Repository, https://github.com/mldsveda/PyScrappy
7
+ Project-URL: Issues, https://github.com/mldsveda/PyScrappy/issues
8
+ Author: Vedaant Singh
9
+ Author-email: Vedant Tibrewal <mlds93363@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: beautifulsoup,crawler,data-extraction,httpx,playwright,scraping,web-scraping
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Internet :: WWW/HTTP
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.9
28
+ Requires-Dist: beautifulsoup4>=4.12
29
+ Requires-Dist: httpx>=0.24
30
+ Requires-Dist: lxml>=4.9
31
+ Provides-Extra: all
32
+ Requires-Dist: pandas>=1.5; extra == 'all'
33
+ Requires-Dist: playwright>=1.40; extra == 'all'
34
+ Provides-Extra: browser
35
+ Requires-Dist: playwright>=1.40; extra == 'browser'
36
+ Provides-Extra: dataframe
37
+ Requires-Dist: pandas>=1.5; extra == 'dataframe'
38
+ Description-Content-Type: text/markdown
39
+
40
+ <div align="center">
41
+ <img src="https://raw.githubusercontent.com/mldsveda/PyScrappy/main/PyScrappy.png">
42
+ <hr>
43
+ </div>
44
+
45
+ ## PyScrappy: a robust, all-in-one Python web scraping toolkit
46
+
47
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
48
+ [![PyPI Latest Release](https://img.shields.io/pypi/v/PyScrappy.svg)](https://pypi.org/project/PyScrappy/)
49
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
50
+
51
+ PyScrappy is a Python toolkit for web scraping that works out of the box. Point it at any URL and get structured data back — or use built-in scrapers for Wikipedia, IMDB, Yahoo Finance, news feeds, and more.
52
+
53
+ ### Key features
54
+
55
+ - **Generic scraper** — give it any URL, get back structured text, links, images, tables, and metadata
56
+ - **Auto-pagination** — automatically follows "next page" links
57
+ - **JS rendering** — optional Playwright backend for JavaScript-heavy sites
58
+ - **Custom selectors** — pass CSS selectors to extract exactly what you need
59
+ - **Built-in scrapers** — Wikipedia, IMDB, Yahoo Finance, news (RSS), image search, Amazon, LinkedIn, and more
60
+ - **Clean API** — every scraper returns a `ScrapeResult` with `.to_dataframe()` and `.to_json()`
61
+ - **Retry & rate-limiting** — built-in exponential backoff and per-domain rate limiting
62
+ - **Type-safe** — full type hints, `py.typed` marker
63
+
64
+ ## Installation
65
+
66
+ ```sh
67
+ pip install pyscrappy
68
+ ```
69
+
70
+ **Optional extras:**
71
+
72
+ ```sh
73
+ # Browser support (for JS-rendered pages)
74
+ pip install 'pyscrappy[browser]'
75
+ playwright install chromium
76
+
77
+ # DataFrame support
78
+ pip install 'pyscrappy[dataframe]'
79
+
80
+ # Everything
81
+ pip install 'pyscrappy[all]'
82
+ ```
83
+
84
+ ## Quick start
85
+
86
+ ### Scrape any URL (one-liner)
87
+
88
+ ```python
89
+ from pyscrappy import scrape
90
+
91
+ result = scrape("https://en.wikipedia.org/wiki/Web_scraping")
92
+ print(result.data[0]["metadata"]["title"])
93
+ print(result.data[0]["text"]["word_count"])
94
+ ```
95
+
96
+ ### Custom CSS selectors
97
+
98
+ ```python
99
+ from pyscrappy import GenericScraper
100
+
101
+ with GenericScraper() as gs:
102
+ result = gs.scrape(
103
+ url="https://news.ycombinator.com",
104
+ selectors={"title": ".titleline a", "score": ".score"},
105
+ )
106
+ for item in result.data:
107
+ print(item["title"], item.get("score", ""))
108
+ ```
109
+
110
+ ### Wikipedia
111
+
112
+ ```python
113
+ from pyscrappy import WikipediaScraper
114
+
115
+ with WikipediaScraper() as ws:
116
+ result = ws.scrape(query="Python (programming language)", mode="summary")
117
+ print(result.data[0]["text"])
118
+ ```
119
+
120
+ ### Stock data
121
+
122
+ ```python
123
+ from pyscrappy import StockScraper
124
+
125
+ with StockScraper() as ss:
126
+ result = ss.scrape(symbol="AAPL", mode="history", period="1mo")
127
+ df = result.to_dataframe()
128
+ print(df.head())
129
+ ```
130
+
131
+ ### IMDB
132
+
133
+ ```python
134
+ from pyscrappy import IMDBScraper
135
+
136
+ with IMDBScraper() as scraper:
137
+ result = scraper.scrape(genre="sci-fi", max_pages=2)
138
+ df = result.to_dataframe()
139
+ print(df[["title", "year", "rating"]])
140
+ ```
141
+
142
+ ### News (RSS feeds)
143
+
144
+ ```python
145
+ from pyscrappy import NewsScraper
146
+
147
+ with NewsScraper() as ns:
148
+ result = ns.scrape(feed_url="https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
149
+ for article in result.data[:5]:
150
+ print(article["title"])
151
+ ```
152
+
153
+ ### Image search
154
+
155
+ ```python
156
+ from pyscrappy import ImageSearchScraper
157
+
158
+ with ImageSearchScraper() as iss:
159
+ result = iss.scrape(query="golden retriever", max_images=10, download_to="./dogs")
160
+ ```
161
+
162
+ ## Configuration
163
+
164
+ ```python
165
+ from pyscrappy import ScraperConfig, GenericScraper
166
+
167
+ config = ScraperConfig(
168
+ timeout=20.0, # request timeout in seconds
169
+ max_retries=3, # retry failed requests
170
+ rate_limit=2.0, # seconds between requests per domain
171
+ proxy="http://...", # HTTP/SOCKS proxy
172
+ headless=True, # browser runs headless
173
+ render_js="auto", # auto-detect if JS rendering is needed
174
+ )
175
+
176
+ with GenericScraper(config) as gs:
177
+ result = gs.scrape(url="https://example.com")
178
+ ```
179
+
180
+ ### YouTube
181
+
182
+ ```python
183
+ from pyscrappy import YouTubeScraper
184
+
185
+ with YouTubeScraper() as scraper:
186
+ result = scraper.scrape(query="python tutorial", max_results=10)
187
+ for video in result.data:
188
+ print(video["title"], video.get("views", ""))
189
+ ```
190
+
191
+ ### SoundCloud
192
+
193
+ ```python
194
+ from pyscrappy import SoundCloudScraper
195
+
196
+ with SoundCloudScraper() as scraper:
197
+ result = scraper.scrape(query="lo-fi beats", max_results=10)
198
+ ```
199
+
200
+ ### E-Commerce (Alibaba, Flipkart, Snapdeal)
201
+
202
+ ```python
203
+ from pyscrappy import AlibabaScraper, FlipkartScraper, SnapdealScraper
204
+
205
+ with FlipkartScraper() as scraper:
206
+ result = scraper.scrape(query="laptop", max_pages=2)
207
+ df = result.to_dataframe()
208
+ ```
209
+
210
+ ### Food Delivery (Swiggy, Zomato)
211
+
212
+ ```python
213
+ from pyscrappy import SwiggyScraper, ZomatoScraper
214
+
215
+ # These are JS-heavy — use render_js=True for best results
216
+ with SwiggyScraper() as scraper:
217
+ result = scraper.scrape(city="bangalore", render_js=True)
218
+ ```
219
+
220
+ ## Built-in scrapers
221
+
222
+ | Scraper | What it does | Needs browser? |
223
+ |---------|-------------|----------------|
224
+ | `GenericScraper` | Scrape any URL with auto-extraction | Optional |
225
+ | **Data / Research** | | |
226
+ | `WikipediaScraper` | Articles, sections, infoboxes | No |
227
+ | `IMDBScraper` | Movies by genre, search, charts | No |
228
+ | `StockScraper` | Quotes, history, profiles (Yahoo Finance) | No |
229
+ | `NewsScraper` | RSS/Atom feeds, article extraction | No |
230
+ | `ImageSearchScraper` | Image search + download | No |
231
+ | `LinkedInJobsScraper` | Public job listings | No |
232
+ | **E-Commerce** | | |
233
+ | `AmazonScraper` | Product search | No |
234
+ | `AlibabaScraper` | Product search | No |
235
+ | `FlipkartScraper` | Product search | No |
236
+ | `SnapdealScraper` | Product search | No |
237
+ | **Social Media** | | |
238
+ | `YouTubeScraper` | Video search, channel scraping | Optional |
239
+ | `InstagramScraper` | Profiles, hashtag posts | Recommended |
240
+ | `TwitterScraper` | Tweet search | Recommended |
241
+ | **Music** | | |
242
+ | `SpotifyScraper` | Track/playlist search | Recommended |
243
+ | `SoundCloudScraper` | Track search | Optional |
244
+ | **Food Delivery** | | |
245
+ | `SwiggyScraper` | Restaurant listings | Recommended |
246
+ | `ZomatoScraper` | Restaurant listings | Recommended |
247
+
248
+ ## Dependencies
249
+
250
+ **Required:** `httpx`, `beautifulsoup4`, `lxml`
251
+
252
+ **Optional:** `playwright` (JS rendering), `pandas` (DataFrames)
253
+
254
+ ## License
255
+
256
+ [MIT](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
257
+
258
+ ## Contributing
259
+
260
+ All contributions welcome. See [Issues](https://github.com/mldsveda/PyScrappy/issues).
261
+
262
+ **This package is for educational and research purposes.**
Binary file