pyscrappy 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyscrappy-1.0.0/.github/workflows/ci.yml +41 -0
- pyscrappy-1.0.0/.github/workflows/python-publish.yml +24 -0
- pyscrappy-1.0.0/.gitignore +9 -0
- pyscrappy-1.0.0/CONTRIBUTING.md +312 -0
- pyscrappy-1.0.0/LICENSE +21 -0
- pyscrappy-1.0.0/PKG-INFO +262 -0
- pyscrappy-1.0.0/PyScrappy.png +0 -0
- pyscrappy-1.0.0/README.md +223 -0
- pyscrappy-1.0.0/docs/evidence-cover-sheet.md +144 -0
- pyscrappy-1.0.0/pyproject.toml +67 -0
- pyscrappy-1.0.0/src/pyscrappy/__init__.py +131 -0
- pyscrappy-1.0.0/src/pyscrappy/core/__init__.py +27 -0
- pyscrappy-1.0.0/src/pyscrappy/core/base.py +76 -0
- pyscrappy-1.0.0/src/pyscrappy/core/browser.py +110 -0
- pyscrappy-1.0.0/src/pyscrappy/core/config.py +47 -0
- pyscrappy-1.0.0/src/pyscrappy/core/exceptions.py +31 -0
- pyscrappy-1.0.0/src/pyscrappy/core/http.py +140 -0
- pyscrappy-1.0.0/src/pyscrappy/core/models.py +84 -0
- pyscrappy-1.0.0/src/pyscrappy/generic/__init__.py +3 -0
- pyscrappy-1.0.0/src/pyscrappy/generic/extractors.py +204 -0
- pyscrappy-1.0.0/src/pyscrappy/generic/pagination.py +84 -0
- pyscrappy-1.0.0/src/pyscrappy/generic/scraper.py +229 -0
- pyscrappy-1.0.0/src/pyscrappy/py.typed +0 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/__init__.py +37 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/alibaba.py +149 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/amazon.py +162 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/flipkart.py +149 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/image_search.py +170 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/imdb.py +223 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/instagram.py +253 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/linkedin.py +148 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/news.py +230 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/snapdeal.py +130 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/soundcloud.py +184 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/spotify.py +234 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/stock.py +222 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/swiggy.py +215 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/twitter.py +174 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/wikipedia.py +195 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/youtube.py +211 -0
- pyscrappy-1.0.0/src/pyscrappy/scrapers/zomato.py +219 -0
- pyscrappy-1.0.0/tests/__init__.py +0 -0
- pyscrappy-1.0.0/tests/test_core/__init__.py +0 -0
- pyscrappy-1.0.0/tests/test_core/test_base.py +156 -0
- pyscrappy-1.0.0/tests/test_core/test_browser.py +156 -0
- pyscrappy-1.0.0/tests/test_core/test_config.py +60 -0
- pyscrappy-1.0.0/tests/test_core/test_exceptions.py +59 -0
- pyscrappy-1.0.0/tests/test_core/test_http.py +241 -0
- pyscrappy-1.0.0/tests/test_core/test_models.py +128 -0
- pyscrappy-1.0.0/tests/test_generic/__init__.py +0 -0
- pyscrappy-1.0.0/tests/test_generic/test_extractors.py +357 -0
- pyscrappy-1.0.0/tests/test_generic/test_pagination.py +139 -0
- pyscrappy-1.0.0/tests/test_generic/test_scraper.py +322 -0
- pyscrappy-1.0.0/tests/test_init.py +54 -0
- pyscrappy-1.0.0/tests/test_scrapers/__init__.py +0 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_ecommerce.py +249 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_food.py +219 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_imdb.py +197 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_music.py +238 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_news.py +223 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_other.py +223 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_social.py +374 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_stock.py +214 -0
- pyscrappy-1.0.0/tests/test_scrapers/test_wikipedia.py +155 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- run: pip install ruff
|
|
18
|
+
- run: ruff check src/
|
|
19
|
+
|
|
20
|
+
test:
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
strategy:
|
|
23
|
+
matrix:
|
|
24
|
+
python-version: ["3.9", "3.11", "3.12", "3.13"]
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v4
|
|
27
|
+
- uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
- run: pip install -e '.[all]' pytest
|
|
31
|
+
- run: pytest tests/ -v
|
|
32
|
+
|
|
33
|
+
build:
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@v4
|
|
37
|
+
- uses: actions/setup-python@v5
|
|
38
|
+
with:
|
|
39
|
+
python-version: "3.12"
|
|
40
|
+
- run: pip install build
|
|
41
|
+
- run: python -m build
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
name: Upload Python Package
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
deploy:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- name: Install build tools
|
|
18
|
+
run: pip install build
|
|
19
|
+
- name: Build package
|
|
20
|
+
run: python -m build
|
|
21
|
+
- name: Publish to PyPI
|
|
22
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
23
|
+
with:
|
|
24
|
+
password: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
# Contributing to PyScrappy
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to PyScrappy! We welcome contributions of all kinds — from bug fixes and documentation improvements to new scrapers and core features.
|
|
4
|
+
|
|
5
|
+
## Checklist before submitting a PR
|
|
6
|
+
|
|
7
|
+
Here are the core requirements for any PR submitted to PyScrappy:
|
|
8
|
+
|
|
9
|
+
- [ ] **Keep scope isolated** — your changes should address 1 specific problem at a time
|
|
10
|
+
- [ ] **Add tests** — adding at least 1 test is a hard requirement — [see details](#adding-tests)
|
|
11
|
+
- [ ] **Ensure your PR passes all checks:**
|
|
12
|
+
- [ ] Unit tests — `pytest tests/ -v`
|
|
13
|
+
- [ ] Linting — `ruff check src/`
|
|
14
|
+
|
|
15
|
+
## Quick Start
|
|
16
|
+
|
|
17
|
+
### 1. Setup Your Local Development Environment
|
|
18
|
+
|
|
19
|
+
```sh
|
|
20
|
+
# Fork the repository on GitHub (click the Fork button at https://github.com/mldsveda/PyScrappy)
|
|
21
|
+
# Then clone your fork locally
|
|
22
|
+
git clone https://github.com/YOUR_USERNAME/PyScrappy.git
|
|
23
|
+
cd PyScrappy
|
|
24
|
+
|
|
25
|
+
# Create a new branch for your feature
|
|
26
|
+
git checkout -b your-feature-branch
|
|
27
|
+
|
|
28
|
+
# Install the package in editable mode with all extras
|
|
29
|
+
pip install -e '.[all]'
|
|
30
|
+
|
|
31
|
+
# Install development tools
|
|
32
|
+
pip install pytest ruff mypy
|
|
33
|
+
|
|
34
|
+
# Verify your setup works
|
|
35
|
+
pytest tests/ -v
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
That's it! Your local development environment is ready.
|
|
39
|
+
|
|
40
|
+
### 2. Development Workflow
|
|
41
|
+
|
|
42
|
+
Here's the recommended workflow for making changes:
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
# Make your changes to the code
|
|
46
|
+
# ...
|
|
47
|
+
|
|
48
|
+
# Run linting to catch issues early
|
|
49
|
+
ruff check src/
|
|
50
|
+
|
|
51
|
+
# Run the full test suite
|
|
52
|
+
pytest tests/ -v
|
|
53
|
+
|
|
54
|
+
# Commit your changes
|
|
55
|
+
git add .
|
|
56
|
+
git commit -m "Your descriptive commit message"
|
|
57
|
+
|
|
58
|
+
# Push and create a PR
|
|
59
|
+
git push origin your-feature-branch
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Adding Tests
|
|
63
|
+
|
|
64
|
+
Adding at least 1 test is a **hard requirement** for all PRs.
|
|
65
|
+
|
|
66
|
+
### Where to Add Tests
|
|
67
|
+
|
|
68
|
+
| What you changed | Where to add tests |
|
|
69
|
+
|---|---|
|
|
70
|
+
| `src/pyscrappy/core/` | `tests/test_core/` |
|
|
71
|
+
| `src/pyscrappy/generic/` | `tests/test_generic/` |
|
|
72
|
+
| `src/pyscrappy/scrapers/` | `tests/test_scrapers/` |
|
|
73
|
+
| Package-level (`__init__.py`) | `tests/test_init.py` |
|
|
74
|
+
|
|
75
|
+
### File Naming Convention
|
|
76
|
+
|
|
77
|
+
The `tests/` directory mirrors the structure of `src/pyscrappy/`:
|
|
78
|
+
|
|
79
|
+
| Source file | Test file |
|
|
80
|
+
|---|---|
|
|
81
|
+
| `src/pyscrappy/core/config.py` | `tests/test_core/test_config.py` |
|
|
82
|
+
| `src/pyscrappy/core/http.py` | `tests/test_core/test_http.py` |
|
|
83
|
+
| `src/pyscrappy/generic/extractors.py` | `tests/test_generic/test_extractors.py` |
|
|
84
|
+
| `src/pyscrappy/scrapers/wikipedia.py` | `tests/test_scrapers/test_wikipedia.py` |
|
|
85
|
+
|
|
86
|
+
### Key Testing Principles
|
|
87
|
+
|
|
88
|
+
- **Mock HTTP calls** — never make real network requests in tests. Use `unittest.mock.MagicMock` to mock the `_http` attribute on scrapers.
|
|
89
|
+
- **Test parsing logic** — provide realistic sample HTML/JSON and verify the scraper extracts the correct fields.
|
|
90
|
+
- **Test edge cases** — empty responses, missing fields, malformed HTML.
|
|
91
|
+
- **Test validation** — ensure proper errors are raised for invalid arguments.
|
|
92
|
+
|
|
93
|
+
### Example Test
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from unittest.mock import MagicMock
|
|
97
|
+
from pyscrappy.scrapers.wikipedia import WikipediaScraper
|
|
98
|
+
|
|
99
|
+
SAMPLE_HTML = """
|
|
100
|
+
<html><body>
|
|
101
|
+
<div id="mw-content-text">
|
|
102
|
+
<div class="mw-parser-output">
|
|
103
|
+
<p>Python is a high-level programming language.</p>
|
|
104
|
+
</div>
|
|
105
|
+
</div>
|
|
106
|
+
</body></html>
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
def test_wikipedia_scrape_returns_paragraphs():
|
|
110
|
+
"""Test that WikipediaScraper extracts paragraph text."""
|
|
111
|
+
scraper = WikipediaScraper()
|
|
112
|
+
mock_http = MagicMock()
|
|
113
|
+
mock_http.get_html.return_value = SAMPLE_HTML
|
|
114
|
+
scraper._http = mock_http
|
|
115
|
+
|
|
116
|
+
result = scraper.scrape(query="Python", mode="paragraphs")
|
|
117
|
+
|
|
118
|
+
assert len(result.data) > 0
|
|
119
|
+
assert result.data[0]["type"] == "paragraph"
|
|
120
|
+
assert "Python" in result.data[0]["text"]
|
|
121
|
+
scraper.close()
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Running Tests and Checks
|
|
125
|
+
|
|
126
|
+
### Running Unit Tests
|
|
127
|
+
|
|
128
|
+
Run the full test suite:
|
|
129
|
+
|
|
130
|
+
```sh
|
|
131
|
+
pytest tests/ -v
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Run a specific test file:
|
|
135
|
+
|
|
136
|
+
```sh
|
|
137
|
+
pytest tests/test_core/test_http.py -v
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Run a specific test:
|
|
141
|
+
|
|
142
|
+
```sh
|
|
143
|
+
pytest tests/test_scrapers/test_wikipedia.py::TestWikipediaScraperFull::test_full_mode -v
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Running Linting
|
|
147
|
+
|
|
148
|
+
Run Ruff linting (matches CI):
|
|
149
|
+
|
|
150
|
+
```sh
|
|
151
|
+
ruff check src/
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Auto-fix linting issues:
|
|
155
|
+
|
|
156
|
+
```sh
|
|
157
|
+
ruff check src/ --fix
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Running Type Checks (optional)
|
|
161
|
+
|
|
162
|
+
```sh
|
|
163
|
+
mypy src/pyscrappy/
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### CI Compatibility
|
|
167
|
+
|
|
168
|
+
To ensure your changes will pass CI, run the same checks locally:
|
|
169
|
+
|
|
170
|
+
```sh
|
|
171
|
+
# These match the lint and test jobs of the GitHub Actions CI workflow (CI additionally runs `python -m build`)
|
|
172
|
+
ruff check src/
|
|
173
|
+
pytest tests/ -v
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
CI runs tests across Python 3.9, 3.11, 3.12, and 3.13.
|
|
177
|
+
|
|
178
|
+
## Project Structure
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
PyScrappy/
|
|
182
|
+
├── src/pyscrappy/
|
|
183
|
+
│ ├── __init__.py # Package exports and convenience scrape() function
|
|
184
|
+
│ ├── core/ # Core infrastructure
|
|
185
|
+
│ │ ├── base.py # BaseScraper abstract class
|
|
186
|
+
│ │ ├── browser.py # Playwright browser manager
|
|
187
|
+
│ │ ├── config.py # ScraperConfig dataclass
|
|
188
|
+
│ │ ├── exceptions.py # Custom exception hierarchy
|
|
189
|
+
│ │ ├── http.py # HTTP client with retries/rate-limiting
|
|
190
|
+
│ │ └── models.py # ScrapeResult, ScrapeMetadata, ScrapeError
|
|
191
|
+
│ ├── generic/ # GenericScraper (works on any URL)
|
|
192
|
+
│ │ ├── scraper.py # Main GenericScraper class
|
|
193
|
+
│ │ ├── extractors.py # Metadata, Text, Link, Image, Table extractors
|
|
194
|
+
│ │ └── pagination.py # Auto-pagination detection
|
|
195
|
+
│ └── scrapers/ # Site-specific scrapers (17 total)
|
|
196
|
+
│ ├── wikipedia.py
|
|
197
|
+
│ ├── imdb.py
|
|
198
|
+
│ ├── stock.py
|
|
199
|
+
│ └── ...
|
|
200
|
+
├── tests/
|
|
201
|
+
│ ├── test_core/ # Tests for core/
|
|
202
|
+
│ ├── test_generic/ # Tests for generic/
|
|
203
|
+
│ ├── test_scrapers/ # Tests for scrapers/
|
|
204
|
+
│ └── test_init.py # Package-level import tests
|
|
205
|
+
└── pyproject.toml # Build config, dependencies, tool settings
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Adding a New Scraper
|
|
209
|
+
|
|
210
|
+
Want to add support for a new website? Here's how:
|
|
211
|
+
|
|
212
|
+
### 1. Create the scraper
|
|
213
|
+
|
|
214
|
+
Create a new file in `src/pyscrappy/scrapers/`, e.g. `mysite.py`:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from __future__ import annotations
|
|
218
|
+
from typing import Any
|
|
219
|
+
from pyscrappy.core.base import BaseScraper
|
|
220
|
+
from pyscrappy.core.models import ScrapeMetadata, ScrapeResult
|
|
221
|
+
|
|
222
|
+
class MySiteScraper(BaseScraper):
|
|
223
|
+
name = "mysite"
|
|
224
|
+
|
|
225
|
+
def scrape(self, query: str, **kwargs: object) -> ScrapeResult:
|
|
226
|
+
url = f"https://mysite.com/search?q={query}"
|
|
227
|
+
soup = self.fetch_and_parse(url)
|
|
228
|
+
|
|
229
|
+
items: list[dict[str, Any]] = []
|
|
230
|
+
for card in soup.select(".result-card"):
|
|
231
|
+
items.append({
|
|
232
|
+
"title": card.select_one("h2").get_text(strip=True),
|
|
233
|
+
"url": card.select_one("a")["href"],
|
|
234
|
+
})
|
|
235
|
+
|
|
236
|
+
return ScrapeResult(
|
|
237
|
+
data=items,
|
|
238
|
+
metadata=ScrapeMetadata(source_urls=[url], scraper=self.name),
|
|
239
|
+
)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### 2. Register the export
|
|
243
|
+
|
|
244
|
+
Add your scraper to:
|
|
245
|
+
- `src/pyscrappy/scrapers/__init__.py`
|
|
246
|
+
- `src/pyscrappy/__init__.py` (import + add to `__all__`)
|
|
247
|
+
|
|
248
|
+
### 3. Add tests
|
|
249
|
+
|
|
250
|
+
Create `tests/test_scrapers/test_mysite.py` with mock HTML and assertions.
|
|
251
|
+
|
|
252
|
+
### 4. Submit your PR
|
|
253
|
+
|
|
254
|
+
## Code Quality Standards
|
|
255
|
+
|
|
256
|
+
- **Style** — enforced by [Ruff](https://docs.astral.sh/ruff/) with a 100-character line length
|
|
257
|
+
- **Type hints** — all public APIs should have type annotations
|
|
258
|
+
- **Python version** — must be compatible with Python 3.9+
|
|
259
|
+
- **No real HTTP calls in tests** — always mock network requests
|
|
260
|
+
- **Match existing patterns** — follow the conventions you see in existing scrapers
|
|
261
|
+
|
|
262
|
+
## Common Issues and Solutions
|
|
263
|
+
|
|
264
|
+
### Linting Failures
|
|
265
|
+
|
|
266
|
+
If `ruff check src/` fails:
|
|
267
|
+
- Run `ruff check src/ --fix` to auto-fix most issues
|
|
268
|
+
- Check import ordering (Ruff enforces isort-compatible ordering)
|
|
269
|
+
- Ensure lines are under 100 characters
|
|
270
|
+
|
|
271
|
+
### Test Failures
|
|
272
|
+
|
|
273
|
+
If `pytest tests/ -v` fails:
|
|
274
|
+
- Check if you broke existing functionality
|
|
275
|
+
- Ensure tests use mocks, not real API calls
|
|
276
|
+
- Check test file naming conventions (`test_*.py`)
|
|
277
|
+
- Make sure `__init__.py` exists in test directories
|
|
278
|
+
|
|
279
|
+
### Import Errors
|
|
280
|
+
|
|
281
|
+
If you get `ModuleNotFoundError: No module named 'pyscrappy'`:
|
|
282
|
+
```sh
|
|
283
|
+
pip install -e '.[all]'
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
## Submitting Your PR
|
|
287
|
+
|
|
288
|
+
1. **Push your branch:** `git push origin your-feature-branch`
|
|
289
|
+
2. **Create a PR:** go to GitHub and create a pull request against `main`
|
|
290
|
+
3. **Fill out the description:** clearly explain what your changes do and why
|
|
291
|
+
4. **Wait for CI:** ensure all checks pass (lint, tests across Python versions, build)
|
|
292
|
+
5. **Address feedback:** make requested changes and push updates
|
|
293
|
+
6. **Merge:** once approved, your PR will be merged!
|
|
294
|
+
|
|
295
|
+
## Getting Help
|
|
296
|
+
|
|
297
|
+
If you need help:
|
|
298
|
+
|
|
299
|
+
- [Create an issue](https://github.com/mldsveda/PyScrappy/issues)
|
|
300
|
+
- Check existing [issues](https://github.com/mldsveda/PyScrappy/issues) for similar questions
|
|
301
|
+
|
|
302
|
+
## What to Contribute
|
|
303
|
+
|
|
304
|
+
Looking for ideas? Check out:
|
|
305
|
+
|
|
306
|
+
- **Bug fixes** — check [open issues](https://github.com/mldsveda/PyScrappy/issues)
|
|
307
|
+
- **New scrapers** — add support for a website you use
|
|
308
|
+
- **Test coverage** — improve tests for existing scrapers
|
|
309
|
+
- **Documentation** — improve docstrings, examples, or guides
|
|
310
|
+
- **Core improvements** — better error handling, caching, async support
|
|
311
|
+
|
|
312
|
+
Thank you for contributing to PyScrappy!
|
pyscrappy-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Vedant Tibrewal, Vedaant Singh.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pyscrappy-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyscrappy
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A robust, all-in-one Python web scraping toolkit
|
|
5
|
+
Project-URL: Homepage, https://github.com/mldsveda/PyScrappy
|
|
6
|
+
Project-URL: Repository, https://github.com/mldsveda/PyScrappy
|
|
7
|
+
Project-URL: Issues, https://github.com/mldsveda/PyScrappy/issues
|
|
8
|
+
Author: Vedaant Singh
|
|
9
|
+
Author-email: Vedant Tibrewal <mlds93363@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: beautifulsoup,crawler,data-extraction,httpx,playwright,scraping,web-scraping
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
29
|
+
Requires-Dist: httpx>=0.24
|
|
30
|
+
Requires-Dist: lxml>=4.9
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: pandas>=1.5; extra == 'all'
|
|
33
|
+
Requires-Dist: playwright>=1.40; extra == 'all'
|
|
34
|
+
Provides-Extra: browser
|
|
35
|
+
Requires-Dist: playwright>=1.40; extra == 'browser'
|
|
36
|
+
Provides-Extra: dataframe
|
|
37
|
+
Requires-Dist: pandas>=1.5; extra == 'dataframe'
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
|
|
40
|
+
<div align="center">
|
|
41
|
+
<img src="https://raw.githubusercontent.com/mldsveda/PyScrappy/main/PyScrappy.png">
|
|
42
|
+
<hr>
|
|
43
|
+
</div>
|
|
44
|
+
|
|
45
|
+
## PyScrappy: a robust, all-in-one Python web scraping toolkit
|
|
46
|
+
|
|
47
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
48
|
+
[PyPI](https://pypi.org/project/PyScrappy/)
|
|
49
|
+
[License: MIT](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
|
|
50
|
+
|
|
51
|
+
PyScrappy is a Python toolkit for web scraping that works out of the box. Point it at any URL and get structured data back — or use built-in scrapers for Wikipedia, IMDB, Yahoo Finance, news feeds, and more.
|
|
52
|
+
|
|
53
|
+
### Key features
|
|
54
|
+
|
|
55
|
+
- **Generic scraper** — give it any URL, get back structured text, links, images, tables, and metadata
|
|
56
|
+
- **Auto-pagination** — automatically follows "next page" links
|
|
57
|
+
- **JS rendering** — optional Playwright backend for JavaScript-heavy sites
|
|
58
|
+
- **Custom selectors** — pass CSS selectors to extract exactly what you need
|
|
59
|
+
- **Built-in scrapers** — Wikipedia, IMDB, Yahoo Finance, news (RSS), image search, Amazon, LinkedIn
|
|
60
|
+
- **Clean API** — every scraper returns a `ScrapeResult` with `.to_dataframe()` and `.to_json()`
|
|
61
|
+
- **Retry & rate-limiting** — built-in exponential backoff and per-domain rate limiting
|
|
62
|
+
- **Type-safe** — full type hints, `py.typed` marker
|
|
63
|
+
|
|
64
|
+
## Installation
|
|
65
|
+
|
|
66
|
+
```sh
|
|
67
|
+
pip install pyscrappy
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
**Optional extras:**
|
|
71
|
+
|
|
72
|
+
```sh
|
|
73
|
+
# Browser support (for JS-rendered pages)
|
|
74
|
+
pip install 'pyscrappy[browser]'
|
|
75
|
+
playwright install chromium
|
|
76
|
+
|
|
77
|
+
# DataFrame support
|
|
78
|
+
pip install 'pyscrappy[dataframe]'
|
|
79
|
+
|
|
80
|
+
# Everything
|
|
81
|
+
pip install 'pyscrappy[all]'
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
## Quick start
|
|
85
|
+
|
|
86
|
+
### Scrape any URL (one-liner)
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from pyscrappy import scrape
|
|
90
|
+
|
|
91
|
+
result = scrape("https://en.wikipedia.org/wiki/Web_scraping")
|
|
92
|
+
print(result.data[0]["metadata"]["title"])
|
|
93
|
+
print(result.data[0]["text"]["word_count"])
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Custom CSS selectors
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pyscrappy import GenericScraper
|
|
100
|
+
|
|
101
|
+
with GenericScraper() as gs:
|
|
102
|
+
result = gs.scrape(
|
|
103
|
+
url="https://news.ycombinator.com",
|
|
104
|
+
selectors={"title": ".titleline a", "score": ".score"},
|
|
105
|
+
)
|
|
106
|
+
for item in result.data:
|
|
107
|
+
print(item["title"], item.get("score", ""))
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Wikipedia
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from pyscrappy import WikipediaScraper
|
|
114
|
+
|
|
115
|
+
with WikipediaScraper() as ws:
|
|
116
|
+
result = ws.scrape(query="Python (programming language)", mode="summary")
|
|
117
|
+
print(result.data[0]["text"])
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Stock data
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from pyscrappy import StockScraper
|
|
124
|
+
|
|
125
|
+
with StockScraper() as ss:
|
|
126
|
+
result = ss.scrape(symbol="AAPL", mode="history", period="1mo")
|
|
127
|
+
df = result.to_dataframe()
|
|
128
|
+
print(df.head())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### IMDB
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from pyscrappy import IMDBScraper
|
|
135
|
+
|
|
136
|
+
with IMDBScraper() as scraper:
|
|
137
|
+
result = scraper.scrape(genre="sci-fi", max_pages=2)
|
|
138
|
+
df = result.to_dataframe()
|
|
139
|
+
print(df[["title", "year", "rating"]])
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### News (RSS feeds)
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from pyscrappy import NewsScraper
|
|
146
|
+
|
|
147
|
+
with NewsScraper() as ns:
|
|
148
|
+
result = ns.scrape(feed_url="https://rss.nytimes.com/services/xml/rss/nyt/World.xml")
|
|
149
|
+
for article in result.data[:5]:
|
|
150
|
+
print(article["title"])
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### Image search
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from pyscrappy import ImageSearchScraper
|
|
157
|
+
|
|
158
|
+
with ImageSearchScraper() as iss:
|
|
159
|
+
result = iss.scrape(query="golden retriever", max_images=10, download_to="./dogs")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Configuration
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from pyscrappy import ScraperConfig, GenericScraper
|
|
166
|
+
|
|
167
|
+
config = ScraperConfig(
|
|
168
|
+
timeout=20.0, # request timeout in seconds
|
|
169
|
+
max_retries=3, # retry failed requests
|
|
170
|
+
rate_limit=2.0, # seconds between requests per domain
|
|
171
|
+
proxy="http://...", # HTTP/SOCKS proxy
|
|
172
|
+
headless=True, # browser runs headless
|
|
173
|
+
render_js="auto", # auto-detect if JS rendering is needed
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
with GenericScraper(config) as gs:
|
|
177
|
+
result = gs.scrape(url="https://example.com")
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### YouTube
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from pyscrappy import YouTubeScraper
|
|
184
|
+
|
|
185
|
+
with YouTubeScraper() as scraper:
|
|
186
|
+
result = scraper.scrape(query="python tutorial", max_results=10)
|
|
187
|
+
for video in result.data:
|
|
188
|
+
print(video["title"], video.get("views", ""))
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### SoundCloud
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
from pyscrappy import SoundCloudScraper
|
|
195
|
+
|
|
196
|
+
with SoundCloudScraper() as scraper:
|
|
197
|
+
result = scraper.scrape(query="lo-fi beats", max_results=10)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### E-Commerce (Alibaba, Flipkart, Snapdeal)
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
from pyscrappy import AlibabaScraper, FlipkartScraper, SnapdealScraper
|
|
204
|
+
|
|
205
|
+
with FlipkartScraper() as scraper:
|
|
206
|
+
result = scraper.scrape(query="laptop", max_pages=2)
|
|
207
|
+
df = result.to_dataframe()
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### Food Delivery (Swiggy, Zomato)
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from pyscrappy import SwiggyScraper, ZomatoScraper
|
|
214
|
+
|
|
215
|
+
# These are JS-heavy — use render_js=True for best results
|
|
216
|
+
with SwiggyScraper() as scraper:
|
|
217
|
+
result = scraper.scrape(city="bangalore", render_js=True)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Built-in scrapers
|
|
221
|
+
|
|
222
|
+
| Scraper | What it does | Needs browser? |
|
|
223
|
+
|---------|-------------|----------------|
|
|
224
|
+
| `GenericScraper` | Scrape any URL with auto-extraction | Optional |
|
|
225
|
+
| **Data / Research** | | |
|
|
226
|
+
| `WikipediaScraper` | Articles, sections, infoboxes | No |
|
|
227
|
+
| `IMDBScraper` | Movies by genre, search, charts | No |
|
|
228
|
+
| `StockScraper` | Quotes, history, profiles (Yahoo Finance) | No |
|
|
229
|
+
| `NewsScraper` | RSS/Atom feeds, article extraction | No |
|
|
230
|
+
| `ImageSearchScraper` | Image search + download | No |
|
|
231
|
+
| `LinkedInJobsScraper` | Public job listings | No |
|
|
232
|
+
| **E-Commerce** | | |
|
|
233
|
+
| `AmazonScraper` | Product search | No |
|
|
234
|
+
| `AlibabaScraper` | Product search | No |
|
|
235
|
+
| `FlipkartScraper` | Product search | No |
|
|
236
|
+
| `SnapdealScraper` | Product search | No |
|
|
237
|
+
| **Social Media** | | |
|
|
238
|
+
| `YouTubeScraper` | Video search, channel scraping | Optional |
|
|
239
|
+
| `InstagramScraper` | Profiles, hashtag posts | Recommended |
|
|
240
|
+
| `TwitterScraper` | Tweet search | Recommended |
|
|
241
|
+
| **Music** | | |
|
|
242
|
+
| `SpotifyScraper` | Track/playlist search | Recommended |
|
|
243
|
+
| `SoundCloudScraper` | Track search | Optional |
|
|
244
|
+
| **Food Delivery** | | |
|
|
245
|
+
| `SwiggyScraper` | Restaurant listings | Recommended |
|
|
246
|
+
| `ZomatoScraper` | Restaurant listings | Recommended |
|
|
247
|
+
|
|
248
|
+
## Dependencies
|
|
249
|
+
|
|
250
|
+
**Required:** `httpx`, `beautifulsoup4`, `lxml`
|
|
251
|
+
|
|
252
|
+
**Optional:** `playwright` (JS rendering), `pandas` (DataFrames)
|
|
253
|
+
|
|
254
|
+
## License
|
|
255
|
+
|
|
256
|
+
[MIT](https://github.com/mldsveda/PyScrappy/blob/main/LICENSE)
|
|
257
|
+
|
|
258
|
+
## Contributing
|
|
259
|
+
|
|
260
|
+
All contributions welcome. See [Issues](https://github.com/mldsveda/PyScrappy/issues).
|
|
261
|
+
|
|
262
|
+
**This package is for educational and research purposes.**
|
|
Binary file
|