ghostscraper 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ghostscraper-0.0.1/PKG-INFO +223 -0
- ghostscraper-0.0.1/README.md +194 -0
- ghostscraper-0.0.1/data/ghostscraper/https-www-example-com.json +12 -0
- ghostscraper-0.0.1/ghostscraper/__init__.py +4 -0
- ghostscraper-0.0.1/ghostscraper/ghost_scraper.py +59 -0
- ghostscraper-0.0.1/ghostscraper/markdown_converter.py +277 -0
- ghostscraper-0.0.1/ghostscraper/playwright_installer.py +51 -0
- ghostscraper-0.0.1/ghostscraper/playwright_scraper.py +207 -0
- ghostscraper-0.0.1/pyproject.toml +46 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ghostscraper
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: An asynchronous web scraper using Playwright with HTML to Markdown conversion
|
|
5
|
+
Project-URL: Homepage, https://github.com/Redundando/ghostscraper
|
|
6
|
+
Project-URL: Issues, https://github.com/Redundando/ghostscraper/issues
|
|
7
|
+
Author-email: Arved Klöhn <arved.kloehn@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: async,converter,html,markdown,playwright,web scraping
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Requires-Dist: beautifulsoup4>=4.10.0
|
|
24
|
+
Requires-Dist: cacherator
|
|
25
|
+
Requires-Dist: logorator
|
|
26
|
+
Requires-Dist: playwright>=1.30.0
|
|
27
|
+
Requires-Dist: python-slugify>=8.0.0
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# GhostScraper
|
|
31
|
+
|
|
32
|
+
GhostScraper is an asynchronous web scraping library built on top of Playwright that makes it easy to fetch and convert web content to Markdown format. It handles browser management, retries, and provides a clean interface for working with web content.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- Asynchronous web scraping with Playwright
|
|
37
|
+
- HTML to Markdown conversion
|
|
38
|
+
- Built-in retry mechanism with exponential backoff
|
|
39
|
+
- Result caching using JSONCache
|
|
40
|
+
- Smart content extraction
|
|
41
|
+
- Support for multiple browser types (Chromium, Firefox, WebKit)
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install ghostscraper
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
GhostScraper will automatically install and manage required browsers during the first run.
|
|
50
|
+
|
|
51
|
+
## Basic Usage
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import asyncio
|
|
55
|
+
from ghostscraper import GhostScraper
|
|
56
|
+
|
|
57
|
+
async def main():
|
|
58
|
+
# Create a scraper instance
|
|
59
|
+
scraper = GhostScraper(url="https://example.com")
|
|
60
|
+
|
|
61
|
+
# Get the HTML content
|
|
62
|
+
html = await scraper.html()
|
|
63
|
+
|
|
64
|
+
# Get the Markdown converted content
|
|
65
|
+
markdown = await scraper.markdown()
|
|
66
|
+
|
|
67
|
+
# Get the response code
|
|
68
|
+
status_code = await scraper.response_code()
|
|
69
|
+
|
|
70
|
+
print(f"Status code: {status_code}")
|
|
71
|
+
print(f"Markdown content:\n{markdown}")
|
|
72
|
+
|
|
73
|
+
# Run the async function
|
|
74
|
+
asyncio.run(main())
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## API Reference
|
|
78
|
+
|
|
79
|
+
### GhostScraper
|
|
80
|
+
|
|
81
|
+
The main class for scraping and converting web content.
|
|
82
|
+
|
|
83
|
+
#### Constructor
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
GhostScraper(
|
|
87
|
+
url: str = "",
|
|
88
|
+
clear_cache: bool = False,
|
|
89
|
+
markdown_options: Optional[Dict[str, Any]] = None,
|
|
90
|
+
**kwargs
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
- `url`: The URL to scrape
|
|
95
|
+
- `clear_cache`: Whether to clear the cache before scraping
|
|
96
|
+
- `markdown_options`: Options for the Markdown converter
|
|
97
|
+
- `**kwargs`: Additional arguments passed to the PlaywrightScraper
|
|
98
|
+
|
|
99
|
+
#### Methods
|
|
100
|
+
|
|
101
|
+
- `async html() -> str`: Get the HTML content of the URL
|
|
102
|
+
- `async response_code() -> int`: Get the HTTP response code
|
|
103
|
+
- `async markdown() -> str`: Get the content converted to Markdown
|
|
104
|
+
- `async soup() -> BeautifulSoup`: Get a BeautifulSoup object for the HTML content
|
|
105
|
+
|
|
106
|
+
### **kwargs Keywords
|
|
107
|
+
|
|
108
|
+
The GhostScraper constructor accepts any keyword arguments and passes them directly to the underlying PlaywrightScraper. This allows you to customize the browser behavior without directly interacting with the PlaywrightScraper class.
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# GhostScraper accepts all these keyword arguments which are passed to PlaywrightScraper
|
|
112
|
+
scraper = GhostScraper(
|
|
113
|
+
url="https://example.com",
|
|
114
|
+
browser_type="chromium", # Browser to use: "chromium", "firefox", or "webkit"
|
|
115
|
+
headless=True, # Run browser in headless mode
|
|
116
|
+
browser_args={}, # Arguments for browser launcher
|
|
117
|
+
context_args={}, # Arguments for browser context
|
|
118
|
+
max_retries=3, # Maximum retry attempts
|
|
119
|
+
backoff_factor=2.0, # Exponential backoff factor
|
|
120
|
+
network_idle_timeout=10000, # Network idle timeout (ms)
|
|
121
|
+
load_timeout=30000, # Page load timeout (ms)
|
|
122
|
+
wait_for_selectors=[] # CSS selectors to wait for
|
|
123
|
+
)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
These keyword arguments configure how the page is loaded, browser behavior, and retry mechanisms.
|
|
127
|
+
|
|
128
|
+
## Advanced Usage
|
|
129
|
+
|
|
130
|
+
### Custom Markdown Options
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from ghostscraper import GhostScraper
|
|
134
|
+
|
|
135
|
+
# Configure the Markdown converter
|
|
136
|
+
markdown_options = {
|
|
137
|
+
"strip_tags": ["script", "style", "nav", "footer", "header", "aside"],
|
|
138
|
+
"keep_tags": ["article", "main", "div", "section", "p"],
|
|
139
|
+
"content_selectors": ["article", "main", ".content", "#content"],
|
|
140
|
+
"preserve_images": True,
|
|
141
|
+
"preserve_links": True,
|
|
142
|
+
"preserve_tables": True,
|
|
143
|
+
"include_title": True,
|
|
144
|
+
"compact_output": False
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
# Create a scraper with custom Markdown options
|
|
148
|
+
scraper = GhostScraper(
|
|
149
|
+
url="https://example.com",
|
|
150
|
+
markdown_options=markdown_options
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Custom Browser Configuration
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from ghostscraper import GhostScraper
|
|
158
|
+
|
|
159
|
+
# Create a scraper with custom browser settings
|
|
160
|
+
scraper = GhostScraper(
|
|
161
|
+
url="https://example.com",
|
|
162
|
+
# Browser configuration options (passed to PlaywrightScraper)
|
|
163
|
+
browser_type="firefox", # Use Firefox instead of Chromium
|
|
164
|
+
headless=False, # Show the browser window
|
|
165
|
+
max_retries=5, # Increase retry attempts
|
|
166
|
+
load_timeout=60000, # Increase load timeout to 60 seconds
|
|
167
|
+
wait_for_selectors=[".content", ".main-article"] # Wait for these selectors
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
# You can also pass browser-specific arguments
|
|
171
|
+
scraper = GhostScraper(
|
|
172
|
+
url="https://example.com",
|
|
173
|
+
browser_args={
|
|
174
|
+
"proxy": { # Set up a proxy
|
|
175
|
+
"server": "http://myproxy.com:8080",
|
|
176
|
+
"username": "user",
|
|
177
|
+
"password": "pass"
|
|
178
|
+
},
|
|
179
|
+
"slowMo": 50, # Slow down browser operations by 50ms
|
|
180
|
+
},
|
|
181
|
+
context_args={
|
|
182
|
+
"userAgent": "Custom User Agent", # Set custom user agent
|
|
183
|
+
"viewport": {"width": 1920, "height": 1080} # Set viewport size
|
|
184
|
+
}
|
|
185
|
+
)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Progressive Loading Strategy
|
|
189
|
+
|
|
190
|
+
GhostScraper uses a progressive loading strategy that tries different methods to load the page:
|
|
191
|
+
|
|
192
|
+
1. First tries with `networkidle` - waits until network is idle
|
|
193
|
+
2. If that fails, tries with `load` - waits for the load event
|
|
194
|
+
3. If that fails, tries with `domcontentloaded` - waits for DOM content loaded
|
|
195
|
+
|
|
196
|
+
This ensures maximum compatibility with different websites.
|
|
197
|
+
|
|
198
|
+
### Browser Installation
|
|
199
|
+
|
|
200
|
+
GhostScraper automatically checks if the required browser is installed and installs it if needed:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
# Install browsers manually if needed
|
|
204
|
+
from ghostscraper import install_browser
|
|
205
|
+
|
|
206
|
+
# Install a specific browser type
|
|
207
|
+
install_browser("chromium")
|
|
208
|
+
install_browser("firefox")
|
|
209
|
+
install_browser("webkit")
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Using Caching
|
|
213
|
+
|
|
214
|
+
By default, GhostScraper caches results in the `data/ghostscraper` directory. To clear the cache:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
# Clear cache for a specific URL
|
|
218
|
+
scraper = GhostScraper(url="https://example.com", clear_cache=True)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
MIT
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
# GhostScraper
|
|
2
|
+
|
|
3
|
+
GhostScraper is an asynchronous web scraping library built on top of Playwright that makes it easy to fetch and convert web content to Markdown format. It handles browser management, retries, and provides a clean interface for working with web content.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Asynchronous web scraping with Playwright
|
|
8
|
+
- HTML to Markdown conversion
|
|
9
|
+
- Built-in retry mechanism with exponential backoff
|
|
10
|
+
- Result caching using JSONCache
|
|
11
|
+
- Smart content extraction
|
|
12
|
+
- Support for multiple browser types (Chromium, Firefox, WebKit)
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
pip install ghostscraper
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
GhostScraper will automatically install and manage required browsers during the first run.
|
|
21
|
+
|
|
22
|
+
## Basic Usage
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
import asyncio
|
|
26
|
+
from ghostscraper import GhostScraper
|
|
27
|
+
|
|
28
|
+
async def main():
|
|
29
|
+
# Create a scraper instance
|
|
30
|
+
scraper = GhostScraper(url="https://example.com")
|
|
31
|
+
|
|
32
|
+
# Get the HTML content
|
|
33
|
+
html = await scraper.html()
|
|
34
|
+
|
|
35
|
+
# Get the Markdown converted content
|
|
36
|
+
markdown = await scraper.markdown()
|
|
37
|
+
|
|
38
|
+
# Get the response code
|
|
39
|
+
status_code = await scraper.response_code()
|
|
40
|
+
|
|
41
|
+
print(f"Status code: {status_code}")
|
|
42
|
+
print(f"Markdown content:\n{markdown}")
|
|
43
|
+
|
|
44
|
+
# Run the async function
|
|
45
|
+
asyncio.run(main())
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## API Reference
|
|
49
|
+
|
|
50
|
+
### GhostScraper
|
|
51
|
+
|
|
52
|
+
The main class for scraping and converting web content.
|
|
53
|
+
|
|
54
|
+
#### Constructor
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
GhostScraper(
|
|
58
|
+
url: str = "",
|
|
59
|
+
clear_cache: bool = False,
|
|
60
|
+
markdown_options: Optional[Dict[str, Any]] = None,
|
|
61
|
+
**kwargs
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
- `url`: The URL to scrape
|
|
66
|
+
- `clear_cache`: Whether to clear the cache before scraping
|
|
67
|
+
- `markdown_options`: Options for the Markdown converter
|
|
68
|
+
- `**kwargs`: Additional arguments passed to the PlaywrightScraper
|
|
69
|
+
|
|
70
|
+
#### Methods
|
|
71
|
+
|
|
72
|
+
- `async html() -> str`: Get the HTML content of the URL
|
|
73
|
+
- `async response_code() -> int`: Get the HTTP response code
|
|
74
|
+
- `async markdown() -> str`: Get the content converted to Markdown
|
|
75
|
+
- `async soup() -> BeautifulSoup`: Get a BeautifulSoup object for the HTML content
|
|
76
|
+
|
|
77
|
+
### **kwargs Keywords
|
|
78
|
+
|
|
79
|
+
The GhostScraper constructor accepts any keyword arguments and passes them directly to the underlying PlaywrightScraper. This allows you to customize the browser behavior without directly interacting with the PlaywrightScraper class.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
# GhostScraper accepts all these keyword arguments which are passed to PlaywrightScraper
|
|
83
|
+
scraper = GhostScraper(
|
|
84
|
+
url="https://example.com",
|
|
85
|
+
browser_type="chromium", # Browser to use: "chromium", "firefox", or "webkit"
|
|
86
|
+
headless=True, # Run browser in headless mode
|
|
87
|
+
browser_args={}, # Arguments for browser launcher
|
|
88
|
+
context_args={}, # Arguments for browser context
|
|
89
|
+
max_retries=3, # Maximum retry attempts
|
|
90
|
+
backoff_factor=2.0, # Exponential backoff factor
|
|
91
|
+
network_idle_timeout=10000, # Network idle timeout (ms)
|
|
92
|
+
load_timeout=30000, # Page load timeout (ms)
|
|
93
|
+
wait_for_selectors=[] # CSS selectors to wait for
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
These keyword arguments configure how the page is loaded, browser behavior, and retry mechanisms.
|
|
98
|
+
|
|
99
|
+
## Advanced Usage
|
|
100
|
+
|
|
101
|
+
### Custom Markdown Options
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from ghostscraper import GhostScraper
|
|
105
|
+
|
|
106
|
+
# Configure the Markdown converter
|
|
107
|
+
markdown_options = {
|
|
108
|
+
"strip_tags": ["script", "style", "nav", "footer", "header", "aside"],
|
|
109
|
+
"keep_tags": ["article", "main", "div", "section", "p"],
|
|
110
|
+
"content_selectors": ["article", "main", ".content", "#content"],
|
|
111
|
+
"preserve_images": True,
|
|
112
|
+
"preserve_links": True,
|
|
113
|
+
"preserve_tables": True,
|
|
114
|
+
"include_title": True,
|
|
115
|
+
"compact_output": False
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
# Create a scraper with custom Markdown options
|
|
119
|
+
scraper = GhostScraper(
|
|
120
|
+
url="https://example.com",
|
|
121
|
+
markdown_options=markdown_options
|
|
122
|
+
)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Custom Browser Configuration
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from ghostscraper import GhostScraper
|
|
129
|
+
|
|
130
|
+
# Create a scraper with custom browser settings
|
|
131
|
+
scraper = GhostScraper(
|
|
132
|
+
url="https://example.com",
|
|
133
|
+
# Browser configuration options (passed to PlaywrightScraper)
|
|
134
|
+
browser_type="firefox", # Use Firefox instead of Chromium
|
|
135
|
+
headless=False, # Show the browser window
|
|
136
|
+
max_retries=5, # Increase retry attempts
|
|
137
|
+
load_timeout=60000, # Increase load timeout to 60 seconds
|
|
138
|
+
wait_for_selectors=[".content", ".main-article"] # Wait for these selectors
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# You can also pass browser-specific arguments
|
|
142
|
+
scraper = GhostScraper(
|
|
143
|
+
url="https://example.com",
|
|
144
|
+
browser_args={
|
|
145
|
+
"proxy": { # Set up a proxy
|
|
146
|
+
"server": "http://myproxy.com:8080",
|
|
147
|
+
"username": "user",
|
|
148
|
+
"password": "pass"
|
|
149
|
+
},
|
|
150
|
+
"slowMo": 50, # Slow down browser operations by 50ms
|
|
151
|
+
},
|
|
152
|
+
context_args={
|
|
153
|
+
"userAgent": "Custom User Agent", # Set custom user agent
|
|
154
|
+
"viewport": {"width": 1920, "height": 1080} # Set viewport size
|
|
155
|
+
}
|
|
156
|
+
)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Progressive Loading Strategy
|
|
160
|
+
|
|
161
|
+
GhostScraper uses a progressive loading strategy that tries different methods to load the page:
|
|
162
|
+
|
|
163
|
+
1. First tries with `networkidle` - waits until network is idle
|
|
164
|
+
2. If that fails, tries with `load` - waits for the load event
|
|
165
|
+
3. If that fails, tries with `domcontentloaded` - waits for DOM content loaded
|
|
166
|
+
|
|
167
|
+
This ensures maximum compatibility with different websites.
|
|
168
|
+
|
|
169
|
+
### Browser Installation
|
|
170
|
+
|
|
171
|
+
GhostScraper automatically checks if the required browser is installed and installs it if needed:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
# Install browsers manually if needed
|
|
175
|
+
from ghostscraper import install_browser
|
|
176
|
+
|
|
177
|
+
# Install a specific browser type
|
|
178
|
+
install_browser("chromium")
|
|
179
|
+
install_browser("firefox")
|
|
180
|
+
install_browser("webkit")
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Using Caching
|
|
184
|
+
|
|
185
|
+
By default, GhostScraper caches results in the `data/ghostscraper` directory. To clear the cache:
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
# Clear cache for a specific URL
|
|
189
|
+
scraper = GhostScraper(url="https://example.com", clear_cache=True)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## License
|
|
193
|
+
|
|
194
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_json_cache_func_cache": {},
|
|
3
|
+
"_json_cache_variable_cache": {
|
|
4
|
+
"_html": "<!DOCTYPE html><html><head>\n <title>Example Domain</title>\n\n <meta charset=\"utf-8\">\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n <style type=\"text/css\">\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n font-family: -apple-system, system-ui, BlinkMacSystemFont, \"Segoe UI\", \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em auto;\n padding: 2em;\n background-color: #fdfdff;\n border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color: #38488f;\n text-decoration: none;\n }\n @media (max-width: 700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n </style> \n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n <p>This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.</p>\n <p><a href=\"https://www.iana.org/domains/example\">More information...</a></p>\n</div>\n\n\n</body></html>",
|
|
5
|
+
"_markdown": null,
|
|
6
|
+
"_markdown_options": {},
|
|
7
|
+
"_response_code": 200,
|
|
8
|
+
"_soup": null,
|
|
9
|
+
"kwargs": {},
|
|
10
|
+
"url": "https://www.example.com"
|
|
11
|
+
}
|
|
12
|
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from logorator import Logger
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from cacherator import Cached, JSONCache
|
|
6
|
+
from slugify import slugify
|
|
7
|
+
|
|
8
|
+
from .markdown_converter import MarkdownConverter
|
|
9
|
+
from .playwright_scraper import PlaywrightScraper
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GhostScraper(JSONCache):
    """Scrape a single URL via Playwright and expose it as HTML, Markdown, soup, and status code.

    Results are persisted through JSONCache under ``data/ghostscraper``, keyed by
    a slug of the URL, so repeated runs for the same URL avoid re-fetching.
    """

    def __init__(self, url: str = "", clear_cache: bool = False, ttl: int = 999, markdown_options: Optional[Dict[str, Any]] = None, **kwargs):
        """
        Args:
            url: The page to scrape.
            clear_cache: Drop any previously cached data for this URL.
            ttl: Cache time-to-live, passed straight through to JSONCache.
            markdown_options: Keyword options forwarded to MarkdownConverter.
            **kwargs: Forwarded verbatim to PlaywrightScraper (browser_type,
                headless, max_retries, timeouts, ...).
        """
        self.url = url
        # Lazily populated result caches; these attributes are also what
        # JSONCache serializes to disk.
        self._html: str | None = None
        self._soup: BeautifulSoup | None = None
        self._markdown: str | None = None
        self._response_code: int | None = None
        self.kwargs = kwargs
        self._markdown_options = markdown_options or {}

        # Must run after the attributes above exist: JSONCache restores any
        # previously persisted values during its __init__.
        JSONCache.__init__(self, data_id=f"{slugify(self.url)}", directory="data/ghostscraper", clear_cache=clear_cache, ttl=ttl)

    def __str__(self):
        return f"{self.url}"

    def __repr__(self):
        return self.__str__()

    @property
    @Cached()
    def _playwright_scraper(self):
        # Cached so the same PlaywrightScraper instance is reused across calls
        # within this object's lifetime.
        return PlaywrightScraper(url=self.url, **self.kwargs)

    @Logger(override_function_name="Fetching URL via Playwright")
    async def _fetch_response(self):
        # Returns an (html, response_code) pair; fetch_and_close also shuts
        # the browser down afterwards.
        return await self._playwright_scraper.fetch_and_close()

    async def get_response(self):
        """Fetch the page at most once and return {"html": ..., "response_code": ...}."""
        if self._response_code is None or self._html is None:
            (self._html, self._response_code) = await self._fetch_response()
        return {"html": self._html, "response_code": self._response_code}

    async def html(self) -> str:
        """Return the page HTML, fetching it on first use."""
        return (await self.get_response())["html"]

    async def response_code(self) -> int:
        """Return the HTTP status code of the fetch."""
        return (await self.get_response())["response_code"]

    async def markdown(self) -> str:
        """Return the page converted to Markdown (converted once, then cached)."""
        if self._markdown is None:
            converter = MarkdownConverter(**self._markdown_options)
            self._markdown = converter.convert(await self.html())
        return self._markdown

    async def soup(self) -> BeautifulSoup:
        """Return a BeautifulSoup parse of the page HTML (parsed once, then cached)."""
        if self._soup is None:
            self._soup = BeautifulSoup(await self.html(), "html.parser")
        return self._soup
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
from typing import Optional, Dict, Any, List, Union, Tuple, Set
|
|
2
|
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MarkdownConverter:
    """Convert an HTML document to Markdown.

    The converter strips boilerplate tags, locates the main content container
    using a list of simple selectors, then walks the tree and emits Markdown
    for the elements it understands; unknown elements are recursed into.
    """

    def __init__(
            self,
            strip_tags: Optional[List[str]] = None,
            keep_tags: Optional[List[str]] = None,
            content_selectors: Optional[List[str]] = None,
            preserve_images: bool = True,
            preserve_links: bool = True,
            preserve_tables: bool = True,
            include_title: bool = True,
            compact_output: bool = False
    ):
        """
        Args:
            strip_tags: Tags removed entirely before conversion.
            keep_tags: Fallback container tags tried when no content selector matches.
            content_selectors: Ordered selectors ("tag", ".class", "#id",
                "[attr='value']") tried to find the main content container.
            preserve_images: Emit Markdown image syntax instead of dropping images.
            preserve_links: Emit Markdown links instead of bare link text.
            preserve_tables: Emit Markdown tables instead of flattened table text.
            include_title: Prepend the page title as a level-1 heading.
            compact_output: Collapse consecutive blank lines in the output.
        """
        self.strip_tags = strip_tags or ["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]
        self.keep_tags = keep_tags or ["article", "main", "div", "section", "p", "h1", "h2", "h3", "h4", "h5", "h6"]
        self.content_selectors = content_selectors or [
            "article", "main", ".content", "#content", ".post-content",
            ".article-content", ".entry-content", "[role='main']"
        ]
        self.preserve_images = preserve_images
        self.preserve_links = preserve_links
        self.preserve_tables = preserve_tables
        self.include_title = include_title
        self.compact_output = compact_output

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Return the document title, falling back to the first <h1>, else ""."""
        title_tag = soup.title
        # BUGFIX: title_tag.string is None when <title> is empty or contains
        # nested markup, which crashed with AttributeError; get_text() is safe
        # in both cases. An empty title now also falls through to the <h1>.
        if title_tag:
            title_text = title_tag.get_text().strip()
            if title_text:
                return title_text
        h1_tag = soup.find("h1")
        if h1_tag:
            return h1_tag.get_text().strip()
        return ""

    def _clean_text(self, text: str) -> str:
        """Collapse internal whitespace and blank-line runs in *text*."""
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'\n\s*\n', '\n\n', text)
        return text

    def _handle_heading(self, tag: Tag, level: int) -> str:
        """Render a heading tag at the given Markdown level (1-6)."""
        text = tag.get_text().strip()
        return f"{'#' * level} {text}\n\n"

    def _handle_paragraph(self, tag: Tag) -> str:
        """Render a paragraph; empty paragraphs produce no output."""
        text = tag.get_text().strip()
        if not text:
            return ""
        return f"{text}\n\n"

    def _handle_list(self, tag: Tag, ordered: bool = False) -> str:
        """Render a <ul>/<ol>; only direct <li> children are considered."""
        result = []
        for i, item in enumerate(tag.find_all("li", recursive=False)):
            prefix = f"{i + 1}. " if ordered else "* "
            text = item.get_text().strip()
            result.append(f"{prefix}{text}")
        return "\n".join(result) + "\n\n"

    def _handle_link(self, tag: Tag) -> str:
        """Render an <a> tag as a Markdown link (or bare text when disabled)."""
        if not self.preserve_links:
            return tag.get_text().strip()

        text = tag.get_text().strip()
        href = tag.get("href", "")
        title = tag.get("title", "")

        # A link without target or text degrades to plain text.
        if not href or not text:
            return text

        if title:
            return f"[{text}]({href} \"{title}\")"
        return f"[{text}]({href})"

    def _handle_image(self, tag: Tag) -> str:
        """Render an <img> tag as a Markdown image (or nothing when disabled)."""
        if not self.preserve_images:
            return ""

        alt = tag.get("alt", "")
        src = tag.get("src", "")
        title = tag.get("title", "")

        if not src:
            return ""

        # Best-effort absolutization of root-relative image paths using the
        # origin of an enclosing absolute link, when one exists.
        if src.startswith("/"):
            parent_link = tag.find_parent("a")
            if parent_link and parent_link.get("href"):
                href = parent_link.get("href", "")
                if href.startswith("http"):
                    base = href.split("//")[0] + "//" + href.split("//")[1].split("/")[0]
                    src = base + src

        # BUGFIX: these return values had been reduced to empty f-strings,
        # silently discarding every image; restore standard Markdown image
        # syntax, mirroring _handle_link's output shape.
        if title:
            return f"![{alt}]({src} \"{title}\")"
        return f"![{alt}]({src})"

    def _handle_table(self, tag: Tag) -> str:
        """Render a <table> as a pipe-delimited Markdown table.

        Headers come from <thead>, else the first row, else synthetic
        "Column N" names; rows are padded/truncated to the header width.
        """
        if not self.preserve_tables:
            return tag.get_text().strip() + "\n\n"

        result = []

        headers = []
        header_row = tag.find("thead")
        if header_row:
            for th in header_row.find_all("th"):
                headers.append(th.get_text().strip())

        if not headers and tag.find("tr"):
            first_row = tag.find("tr")
            for cell in first_row.find_all(["th", "td"]):
                headers.append(cell.get_text().strip())

        if not headers:
            first_row = tag.find("tr")
            if first_row:
                cell_count = len(first_row.find_all(["td", "th"]))
                headers = [f"Column {i + 1}" for i in range(cell_count)]
            else:
                return tag.get_text().strip() + "\n\n"

        result.append("| " + " | ".join(headers) + " |")
        result.append("| " + " | ".join(["---"] * len(headers)) + " |")

        body = tag.find("tbody") or tag
        # Hoisted loop invariant; compared with `is` below.
        first_tr = tag.find("tr")
        for row in body.find_all("tr"):
            # When the headers were taken from the first data row (no <thead>),
            # skip that exact row so it is not duplicated in the body.
            # BUGFIX: the previous `row == tag.find("tr")` used bs4's
            # content-based Tag equality, which also dropped any body row whose
            # content happened to match the first row; identity is intended.
            if not header_row and row is first_tr:
                continue

            cells = []
            row_cells = row.find_all(["td", "th"])

            for cell in row_cells:
                content = cell.get_text().strip()
                colspan = int(cell.get("colspan", 1))
                if colspan > 1:
                    # Spread a spanning cell over empty filler cells to keep
                    # column counts aligned.
                    cells.extend([content] + [""] * (colspan - 1))
                else:
                    cells.append(content)

            # Normalize each row to exactly len(headers) cells.
            while len(cells) < len(headers):
                cells.append("")
            cells = cells[:len(headers)]

            if cells:
                result.append("| " + " | ".join(cells) + " |")

        return "\n".join(result) + "\n\n"

    def _handle_blockquote(self, tag: Tag) -> str:
        """Render a <blockquote>, prefixing every line with "> "."""
        lines = tag.get_text().strip().split("\n")
        result = []
        for line in lines:
            result.append(f"> {line}")
        return "\n".join(result) + "\n\n"

    def _handle_code(self, tag: Tag) -> str:
        """Render a <pre> block as a fenced code block.

        NOTE(review): the language is read from this tag's own class; pages
        commonly put "language-*" on the inner <code> instead — those will
        render without a language tag.
        """
        language = tag.get("class", [""])[0].replace("language-", "") if tag.get("class") else ""
        code = tag.get_text()
        if language:
            return f"```{language}\n{code}\n```\n\n"
        return f"```\n{code}\n```\n\n"

    def _handle_inline_code(self, tag: Tag) -> str:
        """Render inline <code> with backticks."""
        return f"`{tag.get_text()}`"

    def _handle_strong(self, tag: Tag) -> str:
        """Render <strong>/<b> as bold."""
        return f"**{tag.get_text()}**"

    def _handle_em(self, tag: Tag) -> str:
        """Render <em>/<i> as italic."""
        return f"*{tag.get_text()}*"

    def _handle_hr(self, tag: Tag) -> str:
        """Render <hr> as a thematic break."""
        return "---\n\n"

    def _process_node(self, node: Union[Tag, NavigableString]) -> str:
        """Recursively convert one DOM node to Markdown text."""
        if isinstance(node, NavigableString):
            return str(node)

        tag_name = node.name

        if tag_name in self.strip_tags:
            return ""

        # Dispatch table: known elements are rendered directly; anything else
        # falls through to child recursion below.
        handlers = {
            "h1": lambda t: self._handle_heading(t, 1),
            "h2": lambda t: self._handle_heading(t, 2),
            "h3": lambda t: self._handle_heading(t, 3),
            "h4": lambda t: self._handle_heading(t, 4),
            "h5": lambda t: self._handle_heading(t, 5),
            "h6": lambda t: self._handle_heading(t, 6),
            "p": self._handle_paragraph,
            "ul": lambda t: self._handle_list(t, ordered=False),
            "ol": lambda t: self._handle_list(t, ordered=True),
            "a": self._handle_link,
            "img": self._handle_image,
            "table": self._handle_table,
            "blockquote": self._handle_blockquote,
            "pre": self._handle_code,
            "code": self._handle_inline_code,
            "strong": self._handle_strong,
            "b": self._handle_strong,
            "em": self._handle_em,
            "i": self._handle_em,
            "hr": self._handle_hr,
        }

        if tag_name in handlers:
            return handlers[tag_name](node)

        result = ""
        for child in node.children:
            result += self._process_node(child)

        return result

    def _find_content_container(self, soup: BeautifulSoup) -> Optional[Tag]:
        """Locate the element most likely to hold the main content.

        Tries each configured selector in order; among multiple matches the
        one with the most text wins. Falls back to the longest keep_tags
        element, then to <body>.
        """
        for selector in self.content_selectors:
            if selector.startswith("."):
                containers = soup.find_all(class_=selector[1:])
            elif selector.startswith("#"):
                container = soup.find(id=selector[1:])
                containers = [container] if container else []
            elif "[" in selector and "]" in selector:
                # Minimal [attr='value'] parsing — assumes exactly one
                # attribute with a quoted value, e.g. "[role='main']".
                attr_name = selector.split("[")[1].split("=")[0]
                attr_value = selector.split("=")[1].split("]")[0].strip("'\"")
                containers = soup.find_all(attrs={attr_name: attr_value})
            else:
                containers = soup.find_all(selector)

            if containers:
                if len(containers) == 1:
                    return containers[0]

                # Multiple candidates: pick the one with the most text.
                containers_with_length = [(c, len(c.get_text())) for c in containers]
                containers_with_length.sort(key=lambda x: x[1], reverse=True)
                return containers_with_length[0][0]

        for tag_name in self.keep_tags:
            tags = soup.find_all(tag_name)

            if tags:
                tags_with_length = [(tag, len(tag.get_text())) for tag in tags]
                tags_with_length.sort(key=lambda x: x[1], reverse=True)
                return tags_with_length[0][0]

        return soup.body

    def convert(self, html: str) -> str:
        """Convert an HTML string to Markdown; empty input yields ""."""
        if not html:
            return ""

        soup = BeautifulSoup(html, "html.parser")

        # Remove boilerplate before looking for the content container.
        for tag_name in self.strip_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

        content = self._find_content_container(soup)
        if not content:
            content = soup

        title = self._extract_title(soup) if self.include_title else ""
        result = f"# {title}\n\n" if title else ""

        markdown = result + self._process_node(content)

        # Collapse any run of 3+ newlines to a single blank line.
        markdown = re.sub(r'\n{3,}', '\n\n', markdown)

        if self.compact_output:
            markdown = re.sub(r'\n\n+', '\n\n', markdown)

        return markdown.strip()
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
from playwright.async_api import async_playwright, Browser, BrowserContext
|
|
5
|
+
from logorator import Logger
|
|
6
|
+
|
|
7
|
+
async def check_browser_installed(browser_name: str) -> bool:
    """Return True when the named Playwright browser launches successfully.

    Attempts a throwaway launch of the requested browser engine; any
    exception during launch is treated as "not installed".

    Args:
        browser_name: One of "chromium", "firefox", or "webkit".

    Returns:
        True if the browser launched and closed cleanly, False otherwise
        (including when the name is not a known engine).
    """
    async with async_playwright() as p:
        launchers = {
            "chromium": p.chromium,
            "firefox": p.firefox,
            "webkit": p.webkit,
        }
        launcher = launchers.get(browser_name)
        if launcher is None:
            Logger.note(f"❌ Invalid browser name: {browser_name}")
            return False

        try:
            browser = await launcher.launch()
            await browser.close()
            Logger.note(f"✅ {browser_name} is installed and working!")
            return True
        except Exception as e:
            Logger.note(f"❌ {browser_name} is NOT installed or failed to launch: {e}")
            return False
|
|
22
|
+
|
|
23
|
+
@Logger()
def install_browser(browser_type: str) -> bool:
    """Install a Playwright browser via ``python -m playwright install``.

    Runs the installer as a subprocess and logs actionable hints (sudo /
    administrator privileges) when the install fails.

    Args:
        browser_type: Browser engine name to install ("chromium", etc.).

    Returns:
        True when the installer exited successfully, False on any failure.
    """
    try:
        Logger.note(f"\n[Ghostscraper] Installing {browser_type} browser (first-time setup)")
        Logger.note("[Ghostscraper] This may take a few minutes...")

        command = [sys.executable, "-m", "playwright", "install", browser_type]
        subprocess.check_call(command)

        Logger.note(f"[Ghostscraper] Successfully installed {browser_type} browser.")
        return True

    except subprocess.CalledProcessError as err:
        Logger.note(f"\n[Ghostscraper] Failed to install {browser_type} browser. Error code: {err.returncode}")

        # On POSIX systems a non-root user is the most common cause of
        # install failure, so tailor the hint accordingly.
        if os.name == 'posix' and os.geteuid() != 0:
            Logger.note("[Ghostscraper] You may need to run with sudo privileges.")
            Logger.note(f"[Ghostscraper] Try: sudo playwright install {browser_type}")
        else:
            Logger.note("[Ghostscraper] You may need administrator privileges.")
            Logger.note(f"[Ghostscraper] Try running: playwright install {browser_type}")

        return False

    except Exception as err:
        Logger.note(f"\n[Ghostscraper] An unexpected error occurred: {str(err)}")
        Logger.note(f"[Ghostscraper] Please run 'playwright install {browser_type}' manually.")
        return False
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple
|
|
3
|
+
|
|
4
|
+
from logorator import Logger
|
|
5
|
+
from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Playwright, TimeoutError as PlaywrightTimeoutError
|
|
6
|
+
|
|
7
|
+
from .playwright_installer import check_browser_installed, install_browser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PlaywrightScraper:
    """Asynchronous page fetcher built on Playwright.

    Navigation uses progressively weaker wait conditions
    ('networkidle' -> 'load' -> 'domcontentloaded'); failed loads and
    HTTP >= 400 responses are retried with exponential backoff. The
    status code of the most recent navigation is kept in
    ``last_status_code``. Usable as an async context manager.
    """

    # Process-wide cache mapping browser_type -> bool (launchable), so each
    # browser binary is probed/installed at most once per process.
    BROWSERS_CHECKED = {}

    def __init__(self, url: str = "", browser_type: Literal["chromium", "firefox", "webkit"] = "chromium", headless: bool = True, browser_args: Optional[Dict[str, Any]] = None,
                 context_args: Optional[Dict[str, Any]] = None, max_retries: int = 3, backoff_factor: float = 2.0, network_idle_timeout: int = 10000,  # 10 seconds
                 load_timeout: int = 30000,  # 30 seconds
                 wait_for_selectors: Optional[List[str]] = None  # CSS selectors to wait for
                 ):
        """Configure the scraper; no browser is launched until first use.

        Args:
            url: Target URL for fetch().
            browser_type: Engine to use: "chromium", "firefox", or "webkit".
            headless: Launch the browser without a visible window.
            browser_args: Extra keyword arguments for the launch() call.
            context_args: Extra keyword arguments for new_context().
            max_retries: Retries allowed after the initial attempt.
            backoff_factor: Base of the exponential retry delay (seconds).
            network_idle_timeout: Timeout (ms) for the 'networkidle' wait.
            load_timeout: Timeout (ms) for 'load'/'domcontentloaded' waits
                and the default navigation timeout.
            wait_for_selectors: CSS selectors to wait for after navigation.
        """
        self.url = url
        self.browser_type: str = browser_type
        self.headless: bool = headless
        self.browser_args: Dict[str, Any] = browser_args or {}
        self.context_args: Dict[str, Any] = context_args or {}
        self.max_retries: int = max_retries
        self.backoff_factor: float = backoff_factor
        self.network_idle_timeout: int = network_idle_timeout
        self.load_timeout: int = load_timeout
        self.wait_for_selectors: List[str] = wait_for_selectors or []
        # Lazily-created Playwright resources (see _ensure_browser/close).
        self._playwright: Optional[Playwright] = None
        self._browser: Optional[Browser] = None
        self._context: Optional[BrowserContext] = None
        # Status of the most recent navigation; optimistic default.
        self.last_status_code: int = 200

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.__str__()

    async def check_and_install_browser(self):
        """Ensure the configured browser is installed, installing on first use.

        The result is cached in ``BROWSERS_CHECKED`` so repeated calls
        (and other instances using the same engine) skip the probe.

        Returns:
            True if the browser is (now) launchable, False otherwise.
        """
        cached = PlaywrightScraper.BROWSERS_CHECKED.get(self.browser_type)
        if cached is not None:
            return cached
        if await check_browser_installed(self.browser_type):
            PlaywrightScraper.BROWSERS_CHECKED[self.browser_type] = True
            return True
        install_browser(self.browser_type)
        # BUG FIX: the original called asyncio.run(check_browser_installed(...))
        # here, which always raises RuntimeError inside a coroutine because
        # asyncio.run() cannot be called from a running event loop. Await the
        # coroutine on the current loop instead.
        result = await check_browser_installed(self.browser_type)
        PlaywrightScraper.BROWSERS_CHECKED[self.browser_type] = result
        return result

    async def _ensure_browser(self) -> None:
        """Lazily start Playwright and open a browser + context once.

        Raises:
            ValueError: If ``browser_type`` is not a known engine.
        """
        await self.check_and_install_browser()
        if self._playwright is None:
            self._playwright = await async_playwright().start()

            if self.browser_type == "chromium":
                browser_launcher = self._playwright.chromium
            elif self.browser_type == "firefox":
                browser_launcher = self._playwright.firefox
            elif self.browser_type == "webkit":
                browser_launcher = self._playwright.webkit
            else:
                raise ValueError(f"Unknown browser type: {self.browser_type}")

            self._browser = await browser_launcher.launch(headless=self.headless, **self.browser_args)

            self._context = await self._browser.new_context(**self.context_args)

    async def _try_progressive_load(self, page: Page, url: str) -> Tuple[bool, int]:
        """Navigate to *url*, relaxing the wait condition on each timeout.

        Returns:
            (success, status_code). On total failure returns (False, 408).
            A navigation with no response object is assumed to be 200.
        """
        # Strategy 1: Try with networkidle first (strictest, but most reliable)
        try:
            Logger.note(f"GhostScraper: Attempting to load with 'networkidle' (timeout: {self.network_idle_timeout}ms)")
            response = await page.goto(url, wait_until="networkidle", timeout=self.network_idle_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: 'networkidle' timed out, falling back to 'load'")

        # Strategy 2: Fallback to load event (less strict)
        try:
            Logger.note(f"GhostScraper: Attempting to load with 'load' (timeout: {self.load_timeout}ms)")
            response = await page.goto(url, wait_until="load", timeout=self.load_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: 'load' timed out, falling back to 'domcontentloaded'")

        # Strategy 3: Fallback to domcontentloaded (least strict)
        try:
            Logger.note("GhostScraper: Attempting to load with 'domcontentloaded'")
            response = await page.goto(url, wait_until="domcontentloaded", timeout=self.load_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: All loading strategies failed")
            return False, 408  # Request Timeout

    async def _wait_for_selectors(self, page: Page) -> bool:
        """Wait (best-effort, 5s each) for the configured CSS selectors.

        A missing selector is logged and skipped; only an unexpected
        exception makes this return False.
        """
        if not self.wait_for_selectors:
            return True

        try:
            for selector in self.wait_for_selectors:
                try:
                    Logger.note(f"GhostScraper: Waiting for selector '{selector}'")
                    await page.wait_for_selector(selector, timeout=5000)
                    Logger.note(f"GhostScraper: Found selector '{selector}'")
                except PlaywrightTimeoutError:
                    Logger.note(f"GhostScraper: Selector '{selector}' not found, continuing anyway")
            return True
        except Exception as e:
            Logger.note(f"GhostScraper: Error waiting for selectors: {str(e)}")
            return False

    async def fetch(self) -> Tuple[str, int]:
        """Fetch ``self.url`` and return ``(html, status_code)``.

        Retries up to ``max_retries`` times with exponential backoff.
        Returns ("", 408) when all loading strategies keep timing out,
        ("", status) when the server keeps answering with >= 400, and
        ("", 500) on repeated unexpected errors.
        """
        await self._ensure_browser()
        attempts = 0

        while attempts <= self.max_retries:
            # A fresh page per attempt; closed in the finally block below.
            page: Page = await self._context.new_page()
            try:
                # Set a default navigation timeout
                page.set_default_navigation_timeout(self.load_timeout)
                # Try progressive loading strategies
                load_success, status_code = await self._try_progressive_load(page, self.url)
                self.last_status_code = status_code

                if not load_success:
                    if attempts == self.max_retries:
                        Logger.note(f"GhostScraper: Max retries reached. All loading strategies failed.")
                        return "", 408
                    wait_time = self.backoff_factor ** attempts
                    Logger.note(f"GhostScraper: All loading strategies failed. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                    await asyncio.sleep(wait_time)
                    attempts += 1
                    continue

                if status_code >= 400:
                    if attempts == self.max_retries:
                        Logger.note(f"GhostScraper: Max retries reached with status code {status_code}. Returning empty response.")
                        return "", status_code

                    wait_time = self.backoff_factor ** attempts
                    Logger.note(f"GhostScraper: Status code {status_code} received. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                    await asyncio.sleep(wait_time)
                    attempts += 1
                    continue

                # Try to wait for specified selectors (if any)
                await self._wait_for_selectors(page)

                # If we reached here, we consider it a success. Grab the content and return.
                html: str = await page.content()
                return html, status_code

            except PlaywrightTimeoutError as e:
                if attempts == self.max_retries:
                    Logger.note(f"GhostScraper: Max retries reached after timeout. Returning empty response with 408 status.")
                    return "", 408

                wait_time = self.backoff_factor ** attempts
                Logger.note(f"GhostScraper: Timeout error occurred: {str(e)}. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                await asyncio.sleep(wait_time)
                attempts += 1

            except Exception as e:
                if attempts == self.max_retries:
                    Logger.note(f"GhostScraper: Max retries reached after exception: {str(e)}. Returning empty response with 500 status.")
                    return "", 500

                wait_time = self.backoff_factor ** attempts
                Logger.note(f"GhostScraper: Exception occurred: {str(e)}. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                await asyncio.sleep(wait_time)
                attempts += 1

            finally:
                await page.close()

        # This should not be reached, but just in case
        return "", 500

    async def close(self) -> None:
        """Tear down the context, browser, and Playwright driver (idempotent)."""
        if self._context:
            await self._context.close()
            self._context = None

        if self._browser:
            await self._browser.close()
            self._browser = None

        if self._playwright:
            await self._playwright.stop()
            self._playwright = None

    async def fetch_and_close(self) -> Tuple[str, int]:
        """Convenience wrapper: fetch() then always close(), even on error."""
        try:
            return await self.fetch()
        finally:
            await self.close()

    async def __aenter__(self):
        await self._ensure_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ghostscraper"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
description = "An asynchronous web scraper using Playwright with HTML to Markdown conversion"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{name = "Arved Klöhn", email = "arved.kloehn@gmail.com"},
|
|
12
|
+
]
|
|
13
|
+
license = {text = "MIT"}
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.8",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
26
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
27
|
+
]
|
|
28
|
+
keywords = ["web scraping", "playwright", "markdown", "html", "converter", "async"]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"playwright>=1.30.0",
|
|
31
|
+
"beautifulsoup4>=4.10.0",
|
|
32
|
+
"cacherator",
|
|
33
|
+
"logorator",
|
|
34
|
+
"python-slugify>=8.0.0",
|
|
35
|
+
]
|
|
36
|
+
requires-python = ">=3.8"
|
|
37
|
+
|
|
38
|
+
[project.urls]
|
|
39
|
+
Homepage = "https://github.com/Redundando/ghostscraper"
|
|
40
|
+
Issues = "https://github.com/Redundando/ghostscraper/issues"
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel]
|
|
43
|
+
packages = ["ghostscraper"]
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.sdist]
|
|
46
|
+
include = ["ghostscraper"]
|