ghostscraper 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: ghostscraper
3
+ Version: 0.0.1
4
+ Summary: An asynchronous web scraper using Playwright with HTML to Markdown conversion
5
+ Project-URL: Homepage, https://github.com/Redundando/ghostscraper
6
+ Project-URL: Issues, https://github.com/Redundando/ghostscraper/issues
7
+ Author-email: Arved Klöhn <arved.kloehn@gmail.com>
8
+ License: MIT
9
+ Keywords: async,converter,html,markdown,playwright,web scraping
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.8
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Internet :: WWW/HTTP
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing :: Markup :: HTML
21
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
22
+ Requires-Python: >=3.8
23
+ Requires-Dist: beautifulsoup4>=4.10.0
24
+ Requires-Dist: cacherator
25
+ Requires-Dist: logorator
26
+ Requires-Dist: playwright>=1.30.0
27
+ Requires-Dist: python-slugify>=8.0.0
28
+ Description-Content-Type: text/markdown
29
+
30
+ # GhostScraper
31
+
32
+ GhostScraper is an asynchronous web scraping library built on top of Playwright that makes it easy to fetch and convert web content to Markdown format. It handles browser management, retries, and provides a clean interface for working with web content.
33
+
34
+ ## Features
35
+
36
+ - Asynchronous web scraping with Playwright
37
+ - HTML to Markdown conversion
38
+ - Built-in retry mechanism with exponential backoff
39
+ - Result caching using JSONCache
40
+ - Smart content extraction
41
+ - Support for multiple browser types (Chromium, Firefox, WebKit)
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install ghostscraper
47
+ ```
48
+
49
+ GhostScraper will automatically install and manage required browsers during the first run.
50
+
51
+ ## Basic Usage
52
+
53
+ ```python
54
+ import asyncio
55
+ from ghostscraper import GhostScraper
56
+
57
+ async def main():
58
+ # Create a scraper instance
59
+ scraper = GhostScraper(url="https://example.com")
60
+
61
+ # Get the HTML content
62
+ html = await scraper.html()
63
+
64
+ # Get the Markdown converted content
65
+ markdown = await scraper.markdown()
66
+
67
+ # Get the response code
68
+ status_code = await scraper.response_code()
69
+
70
+ print(f"Status code: {status_code}")
71
+ print(f"Markdown content:\n{markdown}")
72
+
73
+ # Run the async function
74
+ asyncio.run(main())
75
+ ```
76
+
77
+ ## API Reference
78
+
79
+ ### GhostScraper
80
+
81
+ The main class for scraping and converting web content.
82
+
83
+ #### Constructor
84
+
85
+ ```python
86
+ GhostScraper(
87
+ url: str = "",
88
+ clear_cache: bool = False,
89
+ markdown_options: Optional[Dict[str, Any]] = None,
90
+ **kwargs
91
+ )
92
+ ```
93
+
94
+ - `url`: The URL to scrape
95
+ - `clear_cache`: Whether to clear the cache before scraping
96
+ - `markdown_options`: Options for the Markdown converter
97
+ - `**kwargs`: Additional arguments passed to the PlaywrightScraper
98
+
99
+ #### Methods
100
+
101
+ - `async html() -> str`: Get the HTML content of the URL
102
+ - `async response_code() -> int`: Get the HTTP response code
103
+ - `async markdown() -> str`: Get the content converted to Markdown
104
+ - `async soup() -> BeautifulSoup`: Get a BeautifulSoup object for the HTML content
105
+
106
+ ### **kwargs Keywords
107
+
108
+ The GhostScraper constructor accepts any keyword arguments and passes them directly to the underlying PlaywrightScraper. This allows you to customize the browser behavior without directly interacting with the PlaywrightScraper class.
109
+
110
+ ```python
111
+ # GhostScraper accepts all these keyword arguments which are passed to PlaywrightScraper
112
+ scraper = GhostScraper(
113
+ url="https://example.com",
114
+ browser_type="chromium", # Browser to use: "chromium", "firefox", or "webkit"
115
+ headless=True, # Run browser in headless mode
116
+ browser_args={}, # Arguments for browser launcher
117
+ context_args={}, # Arguments for browser context
118
+ max_retries=3, # Maximum retry attempts
119
+ backoff_factor=2.0, # Exponential backoff factor
120
+ network_idle_timeout=10000, # Network idle timeout (ms)
121
+ load_timeout=30000, # Page load timeout (ms)
122
+ wait_for_selectors=[] # CSS selectors to wait for
123
+ )
124
+ ```
125
+
126
+ These keyword arguments configure how the page is loaded, browser behavior, and retry mechanisms.
127
+
128
+ ## Advanced Usage
129
+
130
+ ### Custom Markdown Options
131
+
132
+ ```python
133
+ from ghostscraper import GhostScraper
134
+
135
+ # Configure the Markdown converter
136
+ markdown_options = {
137
+ "strip_tags": ["script", "style", "nav", "footer", "header", "aside"],
138
+ "keep_tags": ["article", "main", "div", "section", "p"],
139
+ "content_selectors": ["article", "main", ".content", "#content"],
140
+ "preserve_images": True,
141
+ "preserve_links": True,
142
+ "preserve_tables": True,
143
+ "include_title": True,
144
+ "compact_output": False
145
+ }
146
+
147
+ # Create a scraper with custom Markdown options
148
+ scraper = GhostScraper(
149
+ url="https://example.com",
150
+ markdown_options=markdown_options
151
+ )
152
+ ```
153
+
154
+ ### Custom Browser Configuration
155
+
156
+ ```python
157
+ from ghostscraper import GhostScraper
158
+
159
+ # Create a scraper with custom browser settings
160
+ scraper = GhostScraper(
161
+ url="https://example.com",
162
+ # Browser configuration options (passed to PlaywrightScraper)
163
+ browser_type="firefox", # Use Firefox instead of Chromium
164
+ headless=False, # Show the browser window
165
+ max_retries=5, # Increase retry attempts
166
+ load_timeout=60000, # Increase load timeout to 60 seconds
167
+ wait_for_selectors=[".content", ".main-article"] # Wait for these selectors
168
+ )
169
+
170
+ # You can also pass browser-specific arguments
171
+ scraper = GhostScraper(
172
+ url="https://example.com",
173
+ browser_args={
174
+ "proxy": { # Set up a proxy
175
+ "server": "http://myproxy.com:8080",
176
+ "username": "user",
177
+ "password": "pass"
178
+ },
179
+ "slowMo": 50, # Slow down browser operations by 50ms
180
+ },
181
+ context_args={
182
+ "userAgent": "Custom User Agent", # Set custom user agent
183
+ "viewport": {"width": 1920, "height": 1080} # Set viewport size
184
+ }
185
+ )
186
+ ```
187
+
188
+ ### Progressive Loading Strategy
189
+
190
+ GhostScraper uses a progressive loading strategy that tries different methods to load the page:
191
+
192
+ 1. First tries with `networkidle` - waits until network is idle
193
+ 2. If that fails, tries with `load` - waits for the load event
194
+ 3. If that fails, tries with `domcontentloaded` - waits for DOM content loaded
195
+
196
+ This ensures maximum compatibility with different websites.
197
+
198
+ ### Browser Installation
199
+
200
+ GhostScraper automatically checks if the required browser is installed and installs it if needed:
201
+
202
+ ```python
203
+ # Install browsers manually if needed
204
+ from ghostscraper import install_browser
205
+
206
+ # Install a specific browser type
207
+ install_browser("chromium")
208
+ install_browser("firefox")
209
+ install_browser("webkit")
210
+ ```
211
+
212
+ ### Using Caching
213
+
214
+ By default, GhostScraper caches results in the `data/ghostscraper` directory. To clear the cache:
215
+
216
+ ```python
217
+ # Clear cache for a specific URL
218
+ scraper = GhostScraper(url="https://example.com", clear_cache=True)
219
+ ```
220
+
221
+ ## License
222
+
223
+ MIT
@@ -0,0 +1,194 @@
1
+ # GhostScraper
2
+
3
+ GhostScraper is an asynchronous web scraping library built on top of Playwright that makes it easy to fetch and convert web content to Markdown format. It handles browser management, retries, and provides a clean interface for working with web content.
4
+
5
+ ## Features
6
+
7
+ - Asynchronous web scraping with Playwright
8
+ - HTML to Markdown conversion
9
+ - Built-in retry mechanism with exponential backoff
10
+ - Result caching using JSONCache
11
+ - Smart content extraction
12
+ - Support for multiple browser types (Chromium, Firefox, WebKit)
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ pip install ghostscraper
18
+ ```
19
+
20
+ GhostScraper will automatically install and manage required browsers during the first run.
21
+
22
+ ## Basic Usage
23
+
24
+ ```python
25
+ import asyncio
26
+ from ghostscraper import GhostScraper
27
+
28
+ async def main():
29
+ # Create a scraper instance
30
+ scraper = GhostScraper(url="https://example.com")
31
+
32
+ # Get the HTML content
33
+ html = await scraper.html()
34
+
35
+ # Get the Markdown converted content
36
+ markdown = await scraper.markdown()
37
+
38
+ # Get the response code
39
+ status_code = await scraper.response_code()
40
+
41
+ print(f"Status code: {status_code}")
42
+ print(f"Markdown content:\n{markdown}")
43
+
44
+ # Run the async function
45
+ asyncio.run(main())
46
+ ```
47
+
48
+ ## API Reference
49
+
50
+ ### GhostScraper
51
+
52
+ The main class for scraping and converting web content.
53
+
54
+ #### Constructor
55
+
56
+ ```python
57
+ GhostScraper(
58
+ url: str = "",
59
+ clear_cache: bool = False,
60
+ markdown_options: Optional[Dict[str, Any]] = None,
61
+ **kwargs
62
+ )
63
+ ```
64
+
65
+ - `url`: The URL to scrape
66
+ - `clear_cache`: Whether to clear the cache before scraping
67
+ - `markdown_options`: Options for the Markdown converter
68
+ - `**kwargs`: Additional arguments passed to the PlaywrightScraper
69
+
70
+ #### Methods
71
+
72
+ - `async html() -> str`: Get the HTML content of the URL
73
+ - `async response_code() -> int`: Get the HTTP response code
74
+ - `async markdown() -> str`: Get the content converted to Markdown
75
+ - `async soup() -> BeautifulSoup`: Get a BeautifulSoup object for the HTML content
76
+
77
+ ### **kwargs Keywords
78
+
79
+ The GhostScraper constructor accepts any keyword arguments and passes them directly to the underlying PlaywrightScraper. This allows you to customize the browser behavior without directly interacting with the PlaywrightScraper class.
80
+
81
+ ```python
82
+ # GhostScraper accepts all these keyword arguments which are passed to PlaywrightScraper
83
+ scraper = GhostScraper(
84
+ url="https://example.com",
85
+ browser_type="chromium", # Browser to use: "chromium", "firefox", or "webkit"
86
+ headless=True, # Run browser in headless mode
87
+ browser_args={}, # Arguments for browser launcher
88
+ context_args={}, # Arguments for browser context
89
+ max_retries=3, # Maximum retry attempts
90
+ backoff_factor=2.0, # Exponential backoff factor
91
+ network_idle_timeout=10000, # Network idle timeout (ms)
92
+ load_timeout=30000, # Page load timeout (ms)
93
+ wait_for_selectors=[] # CSS selectors to wait for
94
+ )
95
+ ```
96
+
97
+ These keyword arguments configure how the page is loaded, browser behavior, and retry mechanisms.
98
+
99
+ ## Advanced Usage
100
+
101
+ ### Custom Markdown Options
102
+
103
+ ```python
104
+ from ghostscraper import GhostScraper
105
+
106
+ # Configure the Markdown converter
107
+ markdown_options = {
108
+ "strip_tags": ["script", "style", "nav", "footer", "header", "aside"],
109
+ "keep_tags": ["article", "main", "div", "section", "p"],
110
+ "content_selectors": ["article", "main", ".content", "#content"],
111
+ "preserve_images": True,
112
+ "preserve_links": True,
113
+ "preserve_tables": True,
114
+ "include_title": True,
115
+ "compact_output": False
116
+ }
117
+
118
+ # Create a scraper with custom Markdown options
119
+ scraper = GhostScraper(
120
+ url="https://example.com",
121
+ markdown_options=markdown_options
122
+ )
123
+ ```
124
+
125
+ ### Custom Browser Configuration
126
+
127
+ ```python
128
+ from ghostscraper import GhostScraper
129
+
130
+ # Create a scraper with custom browser settings
131
+ scraper = GhostScraper(
132
+ url="https://example.com",
133
+ # Browser configuration options (passed to PlaywrightScraper)
134
+ browser_type="firefox", # Use Firefox instead of Chromium
135
+ headless=False, # Show the browser window
136
+ max_retries=5, # Increase retry attempts
137
+ load_timeout=60000, # Increase load timeout to 60 seconds
138
+ wait_for_selectors=[".content", ".main-article"] # Wait for these selectors
139
+ )
140
+
141
+ # You can also pass browser-specific arguments
142
+ scraper = GhostScraper(
143
+ url="https://example.com",
144
+ browser_args={
145
+ "proxy": { # Set up a proxy
146
+ "server": "http://myproxy.com:8080",
147
+ "username": "user",
148
+ "password": "pass"
149
+ },
150
+ "slowMo": 50, # Slow down browser operations by 50ms
151
+ },
152
+ context_args={
153
+ "userAgent": "Custom User Agent", # Set custom user agent
154
+ "viewport": {"width": 1920, "height": 1080} # Set viewport size
155
+ }
156
+ )
157
+ ```
158
+
159
+ ### Progressive Loading Strategy
160
+
161
+ GhostScraper uses a progressive loading strategy that tries different methods to load the page:
162
+
163
+ 1. First tries with `networkidle` - waits until network is idle
164
+ 2. If that fails, tries with `load` - waits for the load event
165
+ 3. If that fails, tries with `domcontentloaded` - waits for DOM content loaded
166
+
167
+ This ensures maximum compatibility with different websites.
168
+
169
+ ### Browser Installation
170
+
171
+ GhostScraper automatically checks if the required browser is installed and installs it if needed:
172
+
173
+ ```python
174
+ # Install browsers manually if needed
175
+ from ghostscraper import install_browser
176
+
177
+ # Install a specific browser type
178
+ install_browser("chromium")
179
+ install_browser("firefox")
180
+ install_browser("webkit")
181
+ ```
182
+
183
+ ### Using Caching
184
+
185
+ By default, GhostScraper caches results in the `data/ghostscraper` directory. To clear the cache:
186
+
187
+ ```python
188
+ # Clear cache for a specific URL
189
+ scraper = GhostScraper(url="https://example.com", clear_cache=True)
190
+ ```
191
+
192
+ ## License
193
+
194
+ MIT
@@ -0,0 +1,12 @@
1
+ {
2
+ "_json_cache_func_cache": {},
3
+ "_json_cache_variable_cache": {
4
+ "_html": "<!DOCTYPE html><html><head>\n <title>Example Domain</title>\n\n <meta charset=\"utf-8\">\n <meta http-equiv=\"Content-type\" content=\"text/html; charset=utf-8\">\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\">\n <style type=\"text/css\">\n body {\n background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n font-family: -apple-system, system-ui, BlinkMacSystemFont, \"Segoe UI\", \"Open Sans\", \"Helvetica Neue\", Helvetica, Arial, sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em auto;\n padding: 2em;\n background-color: #fdfdff;\n border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color: #38488f;\n text-decoration: none;\n }\n @media (max-width: 700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n </style> \n</head>\n\n<body>\n<div>\n <h1>Example Domain</h1>\n <p>This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.</p>\n <p><a href=\"https://www.iana.org/domains/example\">More information...</a></p>\n</div>\n\n\n</body></html>",
5
+ "_markdown": null,
6
+ "_markdown_options": {},
7
+ "_response_code": 200,
8
+ "_soup": null,
9
+ "kwargs": {},
10
+ "url": "https://www.example.com"
11
+ }
12
+ }
@@ -0,0 +1,4 @@
1
+ from .playwright_scraper import PlaywrightScraper
2
+ from .markdown_converter import MarkdownConverter
3
+ from .ghost_scraper import GhostScraper
4
+ from .playwright_installer import check_browser_installed, install_browser
@@ -0,0 +1,59 @@
1
+ from logorator import Logger
2
+ from typing import Any, Dict, Optional
3
+
4
+ from bs4 import BeautifulSoup
5
+ from cacherator import Cached, JSONCache
6
+ from slugify import slugify
7
+
8
+ from .markdown_converter import MarkdownConverter
9
+ from .playwright_scraper import PlaywrightScraper
10
+
11
+
12
class GhostScraper(JSONCache):
    """Asynchronous scraper that fetches a URL via Playwright, persists the
    result on disk through JSONCache, and exposes HTML / Markdown /
    BeautifulSoup views of the fetched page.
    """

    def __init__(self, url="", clear_cache=False, ttl=999, markdown_options: Optional[Dict[str, Any]] = None, **kwargs):
        """
        Args:
            url: The URL to scrape.
            clear_cache: If True, discard any previously cached result for this URL.
            ttl: Cache time-to-live, forwarded to JSONCache.
            markdown_options: Options forwarded to MarkdownConverter.
            **kwargs: Extra keyword arguments passed through to PlaywrightScraper.
        """
        self.url = url
        # BUGFIX: use typing.Optional instead of `str | None` — annotations on
        # attribute targets are evaluated at runtime (PEP 526), and the PEP 604
        # union syntax raises TypeError on Python 3.8/3.9, which the package
        # metadata (Requires-Python: >=3.8) claims to support.
        self._html: Optional[str] = None
        self._soup: Optional[BeautifulSoup] = None
        self._markdown: Optional[str] = None
        self._response_code: Optional[int] = None
        self.kwargs = kwargs
        self._markdown_options = markdown_options or {}

        # Cache identity is the slugified URL; state lives under data/ghostscraper.
        JSONCache.__init__(self, data_id=f"{slugify(self.url)}", directory="data/ghostscraper", clear_cache=clear_cache, ttl=ttl)

    def __str__(self):
        return f"{self.url}"

    def __repr__(self):
        return self.__str__()

    @property
    @Cached()
    def _playwright_scraper(self):
        # Lazily build (and memoize via Cached) the underlying browser scraper.
        return PlaywrightScraper(url=self.url, **self.kwargs)

    @Logger(override_function_name="Fetching URL via Playwright")
    async def _fetch_response(self):
        # Fetch the page, then release the browser resources immediately.
        return await self._playwright_scraper.fetch_and_close()

    async def get_response(self):
        """Return ``{"html": ..., "response_code": ...}``, fetching at most once."""
        if self._response_code is None or self._html is None:
            (self._html, self._response_code) = await self._fetch_response()
        return {"html": self._html, "response_code": self._response_code}

    async def html(self):
        """HTML content of the URL (fetched lazily, then cached)."""
        return (await self.get_response())["html"]

    async def response_code(self):
        """HTTP status code of the fetch (fetched lazily, then cached)."""
        return (await self.get_response())["response_code"]

    async def markdown(self) -> str:
        """Page content converted to Markdown via MarkdownConverter."""
        if self._markdown is None:
            converter = MarkdownConverter(**self._markdown_options)
            self._markdown = converter.convert(await self.html())
        return self._markdown

    async def soup(self) -> BeautifulSoup:
        """BeautifulSoup parse tree of the fetched HTML."""
        if self._soup is None:
            self._soup = BeautifulSoup(await self.html(), "html.parser")
        return self._soup
@@ -0,0 +1,277 @@
1
+ from typing import Optional, Dict, Any, List, Union, Tuple, Set
2
+ from bs4 import BeautifulSoup, Tag, NavigableString
3
+ import re
4
+
5
+
6
+ class MarkdownConverter:
7
+ def __init__(
8
+ self,
9
+ strip_tags: Optional[List[str]] = None,
10
+ keep_tags: Optional[List[str]] = None,
11
+ content_selectors: Optional[List[str]] = None,
12
+ preserve_images: bool = True,
13
+ preserve_links: bool = True,
14
+ preserve_tables: bool = True,
15
+ include_title: bool = True,
16
+ compact_output: bool = False
17
+ ):
18
+ self.strip_tags = strip_tags or ["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]
19
+ self.keep_tags = keep_tags or ["article", "main", "div", "section", "p", "h1", "h2", "h3", "h4", "h5", "h6"]
20
+ self.content_selectors = content_selectors or [
21
+ "article", "main", ".content", "#content", ".post-content",
22
+ ".article-content", ".entry-content", "[role='main']"
23
+ ]
24
+ self.preserve_images = preserve_images
25
+ self.preserve_links = preserve_links
26
+ self.preserve_tables = preserve_tables
27
+ self.include_title = include_title
28
+ self.compact_output = compact_output
29
+
30
+ def _extract_title(self, soup: BeautifulSoup) -> str:
31
+ title_tag = soup.title
32
+ if title_tag:
33
+ return title_tag.string.strip()
34
+ h1_tag = soup.find("h1")
35
+ if h1_tag:
36
+ return h1_tag.get_text().strip()
37
+ return ""
38
+
39
+ def _clean_text(self, text: str) -> str:
40
+ text = re.sub(r'\s+', ' ', text).strip()
41
+ text = re.sub(r'\n\s*\n', '\n\n', text)
42
+ return text
43
+
44
+ def _handle_heading(self, tag: Tag, level: int) -> str:
45
+ text = tag.get_text().strip()
46
+ return f"{'#' * level} {text}\n\n"
47
+
48
+ def _handle_paragraph(self, tag: Tag) -> str:
49
+ text = tag.get_text().strip()
50
+ if not text:
51
+ return ""
52
+ return f"{text}\n\n"
53
+
54
+ def _handle_list(self, tag: Tag, ordered: bool = False) -> str:
55
+ result = []
56
+ for i, item in enumerate(tag.find_all("li", recursive=False)):
57
+ prefix = f"{i + 1}. " if ordered else "* "
58
+ text = item.get_text().strip()
59
+ result.append(f"{prefix}{text}")
60
+ return "\n".join(result) + "\n\n"
61
+
62
+ def _handle_link(self, tag: Tag) -> str:
63
+ if not self.preserve_links:
64
+ return tag.get_text().strip()
65
+
66
+ text = tag.get_text().strip()
67
+ href = tag.get("href", "")
68
+ title = tag.get("title", "")
69
+
70
+ if not href or not text:
71
+ return text
72
+
73
+ if title:
74
+ return f"[{text}]({href} \"{title}\")"
75
+ return f"[{text}]({href})"
76
+
77
+ def _handle_image(self, tag: Tag) -> str:
78
+ if not self.preserve_images:
79
+ return ""
80
+
81
+ alt = tag.get("alt", "")
82
+ src = tag.get("src", "")
83
+ title = tag.get("title", "")
84
+
85
+ if not src:
86
+ return ""
87
+
88
+ if src.startswith("/"):
89
+ parent_link = tag.find_parent("a")
90
+ if parent_link and parent_link.get("href"):
91
+ href = parent_link.get("href", "")
92
+ if href.startswith("http"):
93
+ base = href.split("//")[0] + "//" + href.split("//")[1].split("/")[0]
94
+ src = base + src
95
+
96
+ if title:
97
+ return f"![{alt}]({src} \"{title}\")"
98
+ return f"![{alt}]({src})"
99
+
100
+ def _handle_table(self, tag: Tag) -> str:
101
+ if not self.preserve_tables:
102
+ return tag.get_text().strip() + "\n\n"
103
+
104
+ result = []
105
+
106
+ headers = []
107
+ header_row = tag.find("thead")
108
+ if header_row:
109
+ for th in header_row.find_all("th"):
110
+ headers.append(th.get_text().strip())
111
+
112
+ if not headers and tag.find("tr"):
113
+ first_row = tag.find("tr")
114
+ for cell in first_row.find_all(["th", "td"]):
115
+ headers.append(cell.get_text().strip())
116
+
117
+ if not headers:
118
+ first_row = tag.find("tr")
119
+ if first_row:
120
+ cell_count = len(first_row.find_all(["td", "th"]))
121
+ headers = [f"Column {i + 1}" for i in range(cell_count)]
122
+ else:
123
+ return tag.get_text().strip() + "\n\n"
124
+
125
+ result.append("| " + " | ".join(headers) + " |")
126
+ result.append("| " + " | ".join(["---"] * len(headers)) + " |")
127
+
128
+ body = tag.find("tbody") or tag
129
+ for row in body.find_all("tr"):
130
+ if not header_row and row == tag.find("tr"):
131
+ continue
132
+
133
+ cells = []
134
+ row_cells = row.find_all(["td", "th"])
135
+
136
+ for cell in row_cells:
137
+ content = cell.get_text().strip()
138
+ colspan = int(cell.get("colspan", 1))
139
+ if colspan > 1:
140
+ cells.extend([content] + [""] * (colspan - 1))
141
+ else:
142
+ cells.append(content)
143
+
144
+ while len(cells) < len(headers):
145
+ cells.append("")
146
+
147
+ cells = cells[:len(headers)]
148
+
149
+ if cells:
150
+ result.append("| " + " | ".join(cells) + " |")
151
+
152
+ return "\n".join(result) + "\n\n"
153
+
154
+ def _handle_blockquote(self, tag: Tag) -> str:
155
+ lines = tag.get_text().strip().split("\n")
156
+ result = []
157
+ for line in lines:
158
+ result.append(f"> {line}")
159
+ return "\n".join(result) + "\n\n"
160
+
161
+ def _handle_code(self, tag: Tag) -> str:
162
+ language = tag.get("class", [""])[0].replace("language-", "") if tag.get("class") else ""
163
+ code = tag.get_text()
164
+ if language:
165
+ return f"```{language}\n{code}\n```\n\n"
166
+ return f"```\n{code}\n```\n\n"
167
+
168
+ def _handle_inline_code(self, tag: Tag) -> str:
169
+ return f"`{tag.get_text()}`"
170
+
171
+ def _handle_strong(self, tag: Tag) -> str:
172
+ return f"**{tag.get_text()}**"
173
+
174
+ def _handle_em(self, tag: Tag) -> str:
175
+ return f"*{tag.get_text()}*"
176
+
177
+ def _handle_hr(self, tag: Tag) -> str:
178
+ return "---\n\n"
179
+
180
+ def _process_node(self, node: Union[Tag, NavigableString]) -> str:
181
+ if isinstance(node, NavigableString):
182
+ return str(node)
183
+
184
+ tag_name = node.name
185
+
186
+ if tag_name in self.strip_tags:
187
+ return ""
188
+
189
+ handlers = {
190
+ "h1" : lambda t: self._handle_heading(t, 1),
191
+ "h2" : lambda t: self._handle_heading(t, 2),
192
+ "h3" : lambda t: self._handle_heading(t, 3),
193
+ "h4" : lambda t: self._handle_heading(t, 4),
194
+ "h5" : lambda t: self._handle_heading(t, 5),
195
+ "h6" : lambda t: self._handle_heading(t, 6),
196
+ "p" : self._handle_paragraph,
197
+ "ul" : lambda t: self._handle_list(t, ordered=False),
198
+ "ol" : lambda t: self._handle_list(t, ordered=True),
199
+ "a" : self._handle_link,
200
+ "img" : self._handle_image,
201
+ "table" : self._handle_table,
202
+ "blockquote": self._handle_blockquote,
203
+ "pre" : self._handle_code,
204
+ "code" : self._handle_inline_code,
205
+ "strong" : self._handle_strong,
206
+ "b" : self._handle_strong,
207
+ "em" : self._handle_em,
208
+ "i" : self._handle_em,
209
+ "hr" : self._handle_hr,
210
+ }
211
+
212
+ if tag_name in handlers:
213
+ return handlers[tag_name](node)
214
+
215
+ result = ""
216
+ for child in node.children:
217
+ result += self._process_node(child)
218
+
219
+ return result
220
+
221
+ def _find_content_container(self, soup: BeautifulSoup) -> Optional[Tag]:
222
+ for selector in self.content_selectors:
223
+ if selector.startswith("."):
224
+ containers = soup.find_all(class_=selector[1:])
225
+ elif selector.startswith("#"):
226
+ container = soup.find(id=selector[1:])
227
+ containers = [container] if container else []
228
+ elif "[" in selector and "]" in selector:
229
+ attr_name = selector.split("[")[1].split("=")[0]
230
+ attr_value = selector.split("=")[1].split("]")[0].strip("'\"")
231
+ containers = soup.find_all(attrs={attr_name: attr_value})
232
+ else:
233
+ containers = soup.find_all(selector)
234
+
235
+ if containers:
236
+ if len(containers) == 1:
237
+ return containers[0]
238
+
239
+ containers_with_length = [(c, len(c.get_text())) for c in containers]
240
+ containers_with_length.sort(key=lambda x: x[1], reverse=True)
241
+ return containers_with_length[0][0]
242
+
243
+ for tag_name in self.keep_tags:
244
+ tags = soup.find_all(tag_name)
245
+
246
+ if tags:
247
+ tags_with_length = [(tag, len(tag.get_text())) for tag in tags]
248
+ tags_with_length.sort(key=lambda x: x[1], reverse=True)
249
+ return tags_with_length[0][0]
250
+
251
+ return soup.body
252
+
253
+ def convert(self, html: str) -> str:
254
+ if not html:
255
+ return ""
256
+
257
+ soup = BeautifulSoup(html, "html.parser")
258
+
259
+ for tag_name in self.strip_tags:
260
+ for tag in soup.find_all(tag_name):
261
+ tag.decompose()
262
+
263
+ content = self._find_content_container(soup)
264
+ if not content:
265
+ content = soup
266
+
267
+ title = self._extract_title(soup) if self.include_title else ""
268
+ result = f"# {title}\n\n" if title else ""
269
+
270
+ markdown = result + self._process_node(content)
271
+
272
+ markdown = re.sub(r'\n{3,}', '\n\n', markdown)
273
+
274
+ if self.compact_output:
275
+ markdown = re.sub(r'\n\n+', '\n\n', markdown)
276
+
277
+ return markdown.strip()
@@ -0,0 +1,51 @@
1
+ import subprocess
2
+ import sys
3
+ import os
4
+ from playwright.async_api import async_playwright, Browser, BrowserContext
5
+ from logorator import Logger
6
+
7
async def check_browser_installed(browser_name: str) -> bool:
    """Return True when *browser_name* is a known Playwright browser that can
    actually be launched (and cleanly closed) on this machine."""
    async with async_playwright() as p:
        available = {"chromium": p.chromium, "firefox": p.firefox, "webkit": p.webkit}
        launcher = available.get(browser_name)
        if launcher is None:
            Logger.note(f"❌ Invalid browser name: {browser_name}")
            return False

        # Launching is the only reliable installation probe.
        try:
            instance = await launcher.launch()
            await instance.close()
            Logger.note(f"✅ {browser_name} is installed and working!")
            return True
        except Exception as e:
            Logger.note(f"❌ {browser_name} is NOT installed or failed to launch: {e}")
            return False
22
+
23
@Logger()
def install_browser(browser_type: str) -> bool:
    """Install a Playwright browser via ``python -m playwright install``.

    Returns True on success; on failure logs troubleshooting hints and
    returns False instead of raising.
    """
    try:
        Logger.note(f"\n[Ghostscraper] Installing {browser_type} browser (first-time setup)")
        Logger.note("[Ghostscraper] This may take a few minutes...")

        # Invoke playwright through the current interpreter so the install
        # lands in the same environment.
        subprocess.check_call([
                sys.executable, "-m", "playwright", "install", browser_type
        ])

        Logger.note(f"[Ghostscraper] Successfully installed {browser_type} browser.")
        return True

    except subprocess.CalledProcessError as e:
        Logger.note(f"\n[Ghostscraper] Failed to install {browser_type} browser. Error code: {e.returncode}")

        # Tailor the privilege hint to the platform.
        if os.name == 'posix' and os.geteuid() != 0:
            hints = [
                    "[Ghostscraper] You may need to run with sudo privileges.",
                    f"[Ghostscraper] Try: sudo playwright install {browser_type}",
            ]
        else:
            hints = [
                    "[Ghostscraper] You may need administrator privileges.",
                    f"[Ghostscraper] Try running: playwright install {browser_type}",
            ]
        for hint in hints:
            Logger.note(hint)

        return False

    except Exception as e:
        Logger.note(f"\n[Ghostscraper] An unexpected error occurred: {str(e)}")
        Logger.note(f"[Ghostscraper] Please run 'playwright install {browser_type}' manually.")
        return False
@@ -0,0 +1,207 @@
1
+ import asyncio
2
+ from typing import Any, Dict, List, Literal, Optional, Tuple
3
+
4
+ from logorator import Logger
5
+ from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Playwright, TimeoutError as PlaywrightTimeoutError
6
+
7
+ from .playwright_installer import check_browser_installed, install_browser
8
+
9
+
10
class PlaywrightScraper:
    """Asynchronous page fetcher built on Playwright.

    Loads a URL with progressively weaker wait conditions
    (``networkidle`` -> ``load`` -> ``domcontentloaded``), retries failures
    with exponential backoff, and verifies/installs the requested browser
    on first use.
    """

    # Class-level cache mapping browser_type -> bool so the
    # "is this browser installed?" launch probe runs at most once per type
    # across all instances.
    BROWSERS_CHECKED = {}

    def __init__(self, url: str = "", browser_type: Literal["chromium", "firefox", "webkit"] = "chromium", headless: bool = True, browser_args: Optional[Dict[str, Any]] = None,
                 context_args: Optional[Dict[str, Any]] = None, max_retries: int = 3, backoff_factor: float = 2.0, network_idle_timeout: int = 10000,  # 10 seconds
                 load_timeout: int = 30000,  # 30 seconds
                 wait_for_selectors: Optional[List[str]] = None  # CSS selectors to wait for
                 ):
        """Store configuration only; no browser is started until first use.

        Args:
            url: Target URL to fetch.
            browser_type: One of "chromium", "firefox", "webkit".
            headless: Launch the browser without a visible window.
            browser_args: Extra kwargs forwarded to ``launch()``.
            context_args: Extra kwargs forwarded to ``new_context()``.
            max_retries: Retries after the initial attempt.
            backoff_factor: Base of the exponential retry delay (seconds).
            network_idle_timeout: Timeout (ms) for the "networkidle" strategy.
            load_timeout: Timeout (ms) for "load"/"domcontentloaded" strategies.
            wait_for_selectors: CSS selectors to await (best effort) after load.
        """
        self.url = url
        self.browser_type: str = browser_type
        self.headless: bool = headless
        self.browser_args: Dict[str, Any] = browser_args or {}
        self.context_args: Dict[str, Any] = context_args or {}
        self.max_retries: int = max_retries
        self.backoff_factor: float = backoff_factor
        self.network_idle_timeout: int = network_idle_timeout
        self.load_timeout: int = load_timeout
        self.wait_for_selectors: List[str] = wait_for_selectors or []
        self._playwright: Optional[Playwright] = None
        self._browser: Optional[Browser] = None
        self._context: Optional[BrowserContext] = None
        # Status of the most recent load attempt; defaults to 200.
        self.last_status_code: int = 200

    def __str__(self):
        return self.url

    def __repr__(self):
        return self.__str__()

    async def check_and_install_browser(self):
        """Ensure the configured browser is installed; cache the result per type.

        Returns the cached availability flag when present; otherwise probes
        the browser and, when the probe fails, installs it via the Playwright
        CLI and re-probes.
        """
        cached = PlaywrightScraper.BROWSERS_CHECKED.get(self.browser_type)
        if cached is not None:
            return cached
        if await check_browser_installed(self.browser_type):
            PlaywrightScraper.BROWSERS_CHECKED[self.browser_type] = True
            return True
        install_browser(self.browser_type)
        # BUG FIX: the original called asyncio.run(check_browser_installed(...))
        # here, but this coroutine always executes inside a running event loop,
        # where asyncio.run() raises RuntimeError. Awaiting directly is correct.
        PlaywrightScraper.BROWSERS_CHECKED[self.browser_type] = await check_browser_installed(self.browser_type)
        return PlaywrightScraper.BROWSERS_CHECKED[self.browser_type]

    async def _ensure_browser(self) -> None:
        """Lazily start Playwright and open a browser + context (idempotent).

        The launch is guarded by the ``_playwright is None`` check so that
        repeated ``fetch()`` calls reuse the existing browser instead of
        leaking a new browser/context per call.
        """
        await self.check_and_install_browser()
        if self._playwright is None:
            self._playwright = await async_playwright().start()

            if self.browser_type == "chromium":
                browser_launcher = self._playwright.chromium
            elif self.browser_type == "firefox":
                browser_launcher = self._playwright.firefox
            elif self.browser_type == "webkit":
                browser_launcher = self._playwright.webkit
            else:
                raise ValueError(f"Unknown browser type: {self.browser_type}")

            self._browser = await browser_launcher.launch(headless=self.headless, **self.browser_args)
            self._context = await self._browser.new_context(**self.context_args)

    async def _try_progressive_load(self, page: Page, url: str) -> Tuple[bool, int]:
        """Navigate with progressively weaker wait conditions.

        Tries "networkidle", then "load", then "domcontentloaded".
        Returns ``(success, status_code)``; ``(False, 408)`` when every
        strategy times out. A missing response object is treated as 200.
        """
        # Strategy 1: Try with networkidle first (strictest, but most reliable)
        try:
            Logger.note(f"GhostScraper: Attempting to load with 'networkidle' (timeout: {self.network_idle_timeout}ms)")
            response = await page.goto(url, wait_until="networkidle", timeout=self.network_idle_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: 'networkidle' timed out, falling back to 'load'")

        # Strategy 2: Fallback to load event (less strict)
        try:
            Logger.note(f"GhostScraper: Attempting to load with 'load' (timeout: {self.load_timeout}ms)")
            response = await page.goto(url, wait_until="load", timeout=self.load_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: 'load' timed out, falling back to 'domcontentloaded'")

        # Strategy 3: Fallback to domcontentloaded (least strict)
        try:
            Logger.note("GhostScraper: Attempting to load with 'domcontentloaded'")
            response = await page.goto(url, wait_until="domcontentloaded", timeout=self.load_timeout)
            status_code = response.status if response else 200
            return True, status_code
        except PlaywrightTimeoutError:
            Logger.note("GhostScraper: All loading strategies failed")
            return False, 408  # Request Timeout

    async def _wait_for_selectors(self, page: Page) -> bool:
        """Best-effort wait (5s each) for every configured CSS selector.

        A selector that never appears is logged and skipped. Returns False
        only on an unexpected error while waiting.
        """
        if not self.wait_for_selectors:
            return True

        try:
            for selector in self.wait_for_selectors:
                try:
                    Logger.note(f"GhostScraper: Waiting for selector '{selector}'")
                    await page.wait_for_selector(selector, timeout=5000)
                    Logger.note(f"GhostScraper: Found selector '{selector}'")
                except PlaywrightTimeoutError:
                    Logger.note(f"GhostScraper: Selector '{selector}' not found, continuing anyway")
            return True
        except Exception as e:
            Logger.note(f"GhostScraper: Error waiting for selectors: {str(e)}")
            return False

    async def fetch(self) -> Tuple[str, int]:
        """Fetch ``self.url`` and return ``(html, status_code)``.

        Retries up to ``max_retries`` times with exponential backoff
        (``backoff_factor ** attempt`` seconds) on timeouts, HTTP status
        >= 400, and unexpected errors. Returns an empty string with 408
        (timeout) or 500 (error) once retries are exhausted. Each attempt
        uses a fresh page, which is always closed.
        """
        await self._ensure_browser()
        attempts = 0

        while attempts <= self.max_retries:
            page: Page = await self._context.new_page()
            try:
                # Set a default navigation timeout
                page.set_default_navigation_timeout(self.load_timeout)
                # Try progressive loading strategies
                load_success, status_code = await self._try_progressive_load(page, self.url)
                self.last_status_code = status_code

                if not load_success:
                    if attempts == self.max_retries:
                        Logger.note(f"GhostScraper: Max retries reached. All loading strategies failed.")
                        return "", 408
                    wait_time = self.backoff_factor ** attempts
                    Logger.note(f"GhostScraper: All loading strategies failed. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                    await asyncio.sleep(wait_time)
                    attempts += 1
                    continue

                if status_code >= 400:
                    if attempts == self.max_retries:
                        Logger.note(f"GhostScraper: Max retries reached with status code {status_code}. Returning empty response.")
                        return "", status_code

                    wait_time = self.backoff_factor ** attempts
                    Logger.note(f"GhostScraper: Status code {status_code} received. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                    await asyncio.sleep(wait_time)
                    attempts += 1
                    continue

                # Try to wait for specified selectors (if any)
                await self._wait_for_selectors(page)

                # If we reached here, we consider it a success. Grab the content and return.
                html: str = await page.content()
                return html, status_code

            except PlaywrightTimeoutError as e:
                if attempts == self.max_retries:
                    Logger.note(f"GhostScraper: Max retries reached after timeout. Returning empty response with 408 status.")
                    return "", 408

                wait_time = self.backoff_factor ** attempts
                Logger.note(f"GhostScraper: Timeout error occurred: {str(e)}. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                await asyncio.sleep(wait_time)
                attempts += 1

            except Exception as e:
                if attempts == self.max_retries:
                    Logger.note(f"GhostScraper: Max retries reached after exception: {str(e)}. Returning empty response with 500 status.")
                    return "", 500

                wait_time = self.backoff_factor ** attempts
                Logger.note(f"GhostScraper: Exception occurred: {str(e)}. Retrying in {wait_time:.2f}s (attempt {attempts + 1}/{self.max_retries})")
                await asyncio.sleep(wait_time)
                attempts += 1

            finally:
                await page.close()

        # This should not be reached, but just in case
        return "", 500

    async def close(self) -> None:
        """Release the context, browser, and Playwright driver (idempotent)."""
        if self._context:
            await self._context.close()
            self._context = None

        if self._browser:
            await self._browser.close()
            self._browser = None

        if self._playwright:
            await self._playwright.stop()
            self._playwright = None

    async def fetch_and_close(self) -> Tuple[str, int]:
        """Fetch the URL, then always release all browser resources."""
        try:
            return await self.fetch()
        finally:
            await self.close()

    async def __aenter__(self):
        # Async context manager entry: start the browser eagerly.
        await self._ensure_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        # Async context manager exit: always clean up.
        await self.close()
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "ghostscraper"
7
+ version = "0.0.1"
8
+ description = "An asynchronous web scraper using Playwright with HTML to Markdown conversion"
9
+ readme = "README.md"
10
+ authors = [
11
+ {name = "Arved Klöhn", email = "arved.kloehn@gmail.com"},
12
+ ]
13
+ license = {text = "MIT"}
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.8",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Topic :: Internet :: WWW/HTTP",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ "Topic :: Text Processing :: Markup :: HTML",
26
+ "Topic :: Text Processing :: Markup :: Markdown",
27
+ ]
28
+ keywords = ["web scraping", "playwright", "markdown", "html", "converter", "async"]
29
+ dependencies = [
30
+ "playwright>=1.30.0",
31
+ "beautifulsoup4>=4.10.0",
32
+ "cacherator",
33
+ "logorator",
34
+ "python-slugify>=8.0.0",
35
+ ]
36
+ requires-python = ">=3.8"
37
+
38
+ [project.urls]
39
+ Homepage = "https://github.com/Redundando/ghostscraper"
40
+ Issues = "https://github.com/Redundando/ghostscraper/issues"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["ghostscraper"]
44
+
45
+ [tool.hatch.build.targets.sdist]
46
+ include = ["ghostscraper"]