linktrace 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- WebCrawler/Crawler.py +396 -0
- WebCrawler/Serializers.py +165 -0
- WebCrawler/Spider.py +213 -0
- WebCrawler/__init__.py +17 -0
- WebCrawler/cache.py +109 -0
- WebCrawler/py.typed +0 -0
- WebCrawler/robots.py +117 -0
- linktrace-0.1.0.dist-info/METADATA +390 -0
- linktrace-0.1.0.dist-info/RECORD +11 -0
- linktrace-0.1.0.dist-info/WHEEL +4 -0
- linktrace-0.1.0.dist-info/licenses/LICENSE +21 -0
WebCrawler/Crawler.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import ssl
|
|
4
|
+
from urllib.parse import urljoin, urlparse
|
|
5
|
+
|
|
6
|
+
import aiohttp
|
|
7
|
+
import lxml.etree
|
|
8
|
+
import lxml.html
|
|
9
|
+
import tldextract
|
|
10
|
+
|
|
11
|
+
from WebCrawler.cache import ResponseCache
|
|
12
|
+
from WebCrawler.robots import RobotsManager
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class HtmlLink:
    """A hyperlink found on a page: its URL plus the anchor text.

    Identity is defined by the URL alone. Two links with the same URL
    compare equal and hash alike regardless of their text, and a link also
    compares equal to a plain string holding the same URL.
    """

    def __init__(self, url: str, text: str) -> None:
        """Store the link target and its anchor text."""
        self.url = url
        self.text = text

    @staticmethod
    def _url_of(value: object) -> object:
        """Return ``value.url`` for links, or ``value`` itself otherwise."""
        if isinstance(value, HtmlLink):
            return value.url
        return value

    @property
    def schema(self) -> str:
        """Scheme component of the URL as reported by ``urlparse``."""
        parsed = urlparse(self.url)
        return parsed.scheme

    @property
    def description(self) -> str:
        """Alias for the anchor text."""
        return self.text

    def __repr__(self) -> str:
        return f"htmllink(url={self.url!r})"

    def __hash__(self) -> int:
        # Hash like the bare URL string so links and strings agree in sets.
        return hash(self.url)

    def __eq__(self, other: object) -> bool:
        return self.url == self._url_of(other)

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def __lt__(self, other: "HtmlLink | str") -> bool:
        return self.url < self._url_of(other)

    def __gt__(self, other: "HtmlLink | str") -> bool:
        return self.url > self._url_of(other)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Document:
|
|
50
|
+
def __init__(self, url: str, source: str | None) -> None:
|
|
51
|
+
self.url = url
|
|
52
|
+
self.source = source
|
|
53
|
+
self.title: str = ""
|
|
54
|
+
self.internal_links: list[HtmlLink] = []
|
|
55
|
+
self.external_links: list[HtmlLink] = []
|
|
56
|
+
self.links: list[HtmlLink] = []
|
|
57
|
+
self.broken_internal_links: list[BrokenLink] = []
|
|
58
|
+
self.broken_external_links: list[BrokenLink] = []
|
|
59
|
+
self.status_code: int = 0
|
|
60
|
+
self.response_headers: dict[str, str] = {}
|
|
61
|
+
self.dom: object = None
|
|
62
|
+
|
|
63
|
+
@property
|
|
64
|
+
def domain(self) -> str:
|
|
65
|
+
return tldextract.extract(self.url).domain
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class CrawlException(Exception):
    """Error tied to a specific URL.

    Attributes:
        url: Address the failure relates to.
        message: Human-readable description of the failure.
        details: Any extra keyword arguments supplied by the raiser.
    """

    def __init__(self, url: str, msg: str, **kw: object) -> None:
        """Record the URL, the message and any extra context.

        Args:
            url: Address the failure relates to.
            msg: Description of the failure.
            **kw: Optional extra context, kept in ``details``.
        """
        self.url = url
        self.message = msg
        # BaseException.__init__ raises TypeError when given keyword
        # arguments, so extras are kept on the instance instead of being
        # forwarded to the base class.
        self.details: dict[str, object] = dict(kw)
        super().__init__(url, msg)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class BrokenLink(HtmlLink):
    """A link that could not be used, tagged with a status code.

    The link text inherited from ``HtmlLink`` is the status rendered as a
    string.
    """

    def __init__(self, url: str, status: int) -> None:
        """Record ``url`` together with the numeric ``status``."""
        status_text = str(status)
        super().__init__(url, status_text)
        self.status_code = status
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class Crawler:
    """Fetch pages over a shared aiohttp session and extract their links.

    Use it as an async context manager so the HTTP session is created and
    closed around the crawl::

        async with Crawler() as crawler:
            doc = await crawler.crawl_document_async("https://example.com")

    Parsing a page records it in ``visited_urls`` and appends every newly
    seen link to the internal queue.
    """

    def __init__(
        self,
        log_level: int = logging.DEBUG,
        log_name: str | None = None,
        ssl_verify: bool | str = True,
        verify_hostname: bool = True,
        request_timeout: int = 30,
        cache_dir: str | None = None,
        max_retries: int = 3,
        backoff_factor: int = 2,
        request_delay: float = 0.0,
        user_agent: str = "WebCrawler/0.1.0",
        respect_robots_txt: bool = True,
    ) -> None:
        """Configure the crawler; no network resources are created here.

        Args:
            log_level: Level applied to the crawler's logger.
            log_name: Logger name; falls back to this module's name.
            ssl_verify: ``True`` for default verification, ``False`` to
                disable it, or a path to a CA bundle file.
            verify_hostname: Whether TLS hostname checking stays enabled.
            request_timeout: Total per-request timeout in seconds.
            cache_dir: Directory for the response cache; caching is off
                when this is empty or ``None``.
            max_retries: Maximum number of fetch attempts per URL.
            backoff_factor: Multiplier for the exponential retry wait.
            request_delay: Per-domain delay in seconds, used when no
                robots manager is active.
            user_agent: Value sent in the ``User-Agent`` header.
            respect_robots_txt: Whether a ``RobotsManager`` is created with
                the session and asked for the crawl delay.
        """
        self._logger = logging.getLogger(log_name if log_name else __name__)
        self._logger.setLevel(log_level)
        self.visited_urls: set[str] = set()
        self._queue: list[str] = []
        self._links: list[HtmlLink] = []
        self._broken_links: list[BrokenLink] = []

        # Scheme and host of the most recently parsed page, read by
        # relative_to_full(). Initialised here so that method is usable
        # before the first parse_document() call.
        self._protocol_: str = "http"
        self.base_uri: str = ""

        # Session configuration
        self.session: aiohttp.ClientSession | None = None
        self.ssl_verify: bool | str = ssl_verify  # bool or str(path to CA cert)
        self.verify_hostname: bool = verify_hostname
        self.request_timeout: int = request_timeout
        self.max_retries: int = max_retries
        self.backoff_factor: int = backoff_factor

        # Rate limiting and robots.txt
        self.request_delay: float = request_delay
        self.user_agent: str = user_agent
        self.respect_robots_txt: bool = respect_robots_txt
        self.robots_manager: RobotsManager | None = None
        self._domain_locks: dict[str, asyncio.Lock] = {}
        self._last_request_time: dict[str, float] = {}

        # Caching (opt-in)
        self.cache: ResponseCache | None = (
            ResponseCache(cache_dir) if cache_dir else None
        )

        # Cookies handled automatically via CookieJar in session (created lazily)
        self._cookie_jar: aiohttp.CookieJar | None = None

    async def __aenter__(self) -> "Crawler":
        """Enter async context manager: create and setup session."""
        self.session = await self._create_session()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit async context manager: close the session and drop it."""
        if self.session:
            await self.session.close()
        # Drop references to the closed session so a later crawl attempt
        # fails with the "session not initialized" error rather than
        # operating on a closed session.
        self.session = None
        self.robots_manager = None

    async def _create_session(self) -> aiohttp.ClientSession:
        """Create persistent aiohttp session with SSL context, connector, timeouts."""
        # Create cookie jar now (requires event loop)
        if self._cookie_jar is None:
            self._cookie_jar = aiohttp.CookieJar()

        ssl_context = self._build_ssl_context()

        connector = aiohttp.TCPConnector(
            limit=100,
            limit_per_host=10,
            ttl_dns_cache=300,
            ssl=ssl_context,
        )

        session = aiohttp.ClientSession(
            connector=connector,
            cookie_jar=self._cookie_jar,
            timeout=aiohttp.ClientTimeout(total=self.request_timeout),
        )

        # Initialize RobotsManager if robots.txt respect enabled
        if self.respect_robots_txt:
            self.robots_manager = RobotsManager(self.user_agent, session)

        return session

    def _build_ssl_context(self) -> ssl.SSLContext:
        """Build SSL context with flexible verification options.

        Supports:
        - ssl_verify=True (default): verify certs with system CA bundle
        - ssl_verify=False: disable verification (insecure, for testing)
        - ssl_verify="/path/to/ca.pem": verify with custom CA bundle (corporate proxies)

        Raises:
            FileNotFoundError: The custom CA bundle path does not exist.
            ssl.SSLError: The custom CA bundle could not be loaded.
        """
        context = ssl.create_default_context()

        if self.ssl_verify is False:
            # Completely disable verification (INSECURE). check_hostname
            # must be cleared before verify_mode can be set to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            self._logger.warning(
                "SSL certificate verification disabled. "
                "This is insecure and should only be used for testing."
            )
        elif isinstance(self.ssl_verify, str):
            # Load custom CA bundle (corporate proxy scenario)
            try:
                context.load_verify_locations(self.ssl_verify)
                self._logger.debug(f"Loaded custom CA bundle: {self.ssl_verify}")
            except FileNotFoundError:
                self._logger.error(f"CA bundle not found: {self.ssl_verify}")
                raise
            except ssl.SSLError as e:
                self._logger.error(f"Error loading CA bundle: {e}")
                raise
        # else: ssl_verify=True, use defaults (CERT_REQUIRED + system CA bundle)

        # Optionally disable hostname checking (independent of cert verification)
        if not self.verify_hostname:
            context.check_hostname = False
            self._logger.warning(
                "Hostname verification disabled. "
                "This weakens security and should only be used for testing."
            )

        return context

    async def _rate_limit_domain(self, url: str, delay: float) -> None:
        """Enforce per-domain rate limiting using asyncio.Lock.

        Ensures delay seconds pass between requests to same domain. The
        lock is held while sleeping, so concurrent callers for one domain
        are spaced out one after another.
        """
        import time

        domain = urlparse(url).netloc

        # Get or create lock for domain
        if domain not in self._domain_locks:
            self._domain_locks[domain] = asyncio.Lock()

        async with self._domain_locks[domain]:
            # A monotonic clock keeps the spacing correct even when the
            # system wall clock is adjusted during a crawl.
            last_time = self._last_request_time.get(domain)
            if last_time is not None:
                wait_time = delay - (time.monotonic() - last_time)
                if wait_time > 0:
                    self._logger.debug(
                        f"Rate limit: waiting {wait_time:.2f}s before {domain}"
                    )
                    await asyncio.sleep(wait_time)

            self._last_request_time[domain] = time.monotonic()

    async def crawl_document_async(self, url: str) -> Document | None:
        """Fetch and parse a document with retries, with optional caching.

        Args:
            url: Address to fetch.

        Returns:
            A parsed ``Document`` for an HTTP 200 response or a cache hit.
            For any other HTTP status, a ``Document`` with no source that
            carries the status and headers. ``None`` when every attempt
            failed with a transient error or a client error occurred.

        Raises:
            RuntimeError: The session has not been created.
            Exception: Any unexpected error is logged and re-raised.
        """
        if not self.session:
            raise RuntimeError(
                "Crawler.session not initialized. "
                "Use 'async with Crawler(...) as crawler:' context manager."
            )

        # Check cache first
        if self.cache:
            cached = await self.cache.get(url)
            if cached:
                self._logger.debug(f"Cache hit for {url}")
                doc = self.parse_document(url, cached.content)
                doc.status_code = cached.status_code
                doc.response_headers = cached.response_headers
                return doc

        # Get effective delay (robots.txt or configured). When a robots
        # manager is active its value is used as-is and request_delay is
        # not consulted here.
        if self.respect_robots_txt and self.robots_manager:
            delay = await self.robots_manager.get_crawl_delay(url)
        else:
            delay = self.request_delay

        # Enforce per-domain rate limiting
        await self._rate_limit_domain(url, delay)

        request_headers = {"User-Agent": self.user_agent}

        # Implement retry logic
        for attempt in range(self.max_retries):
            try:
                self._logger.debug(f"Fetching {url} (attempt {attempt + 1})")
                async with self.session.get(url, headers=request_headers) as response:
                    status = response.status
                    response_headers = dict(response.headers)

                    if status != 200:
                        self._logger.error(f"Failed to fetch {url}: HTTP {status}")
                        doc = Document(url, None)
                        doc.status_code = status
                        doc.response_headers = response_headers
                        return doc

                    html = await response.text()

                    doc = self.parse_document(url, html)
                    doc.status_code = status
                    doc.response_headers = response_headers

                    # Cache successful responses
                    if self.cache:
                        await self.cache.set(url, status, response_headers, html)

                    return doc

            except (
                TimeoutError,
                # Distinct from the builtin TimeoutError before Python 3.11;
                # without it a request timeout would land in the generic
                # handler below and abort instead of being retried.
                asyncio.TimeoutError,
                aiohttp.ClientConnectorError,
                aiohttp.ServerTimeoutError,
            ) as e:
                if attempt < self.max_retries - 1:
                    wait_time = 2**attempt * self.backoff_factor
                    self._logger.warning(
                        f"Transient error fetching {url} "
                        f"(attempt {attempt + 1}/{self.max_retries}): {e}. "
                        f"Retrying in {wait_time}s..."
                    )
                    await asyncio.sleep(wait_time)
                else:
                    self._logger.error(
                        f"Failed to fetch {url} after {self.max_retries} attempts: {e}"
                    )
            except aiohttp.ClientError as e:
                # Other client errors are not retried.
                self._logger.error(f"Client error while fetching {url}: {e}")
                break
            except Exception as e:
                self._logger.error(f"Unexpected error while fetching {url}: {e}")
                raise

        return None

    def parse_document(self, url: str, source: str | None) -> Document:
        """Parse ``source`` as HTML and collect the page's title and links.

        On a successful parse the URL is added to ``visited_urls``. Links
        that are neither visited nor queued yet are attached to the
        returned document, appended to the crawler-wide link list and
        queued.

        Args:
            url: Address the markup was fetched from; relative links are
                resolved against it.
            source: Raw markup, or ``None`` when there is nothing to parse.

        Returns:
            The populated ``Document``. When ``source`` is ``None`` or
            cannot be parsed, the document has no title, DOM or links.
        """
        page = urlparse(url)
        # Remember the page's scheme and host for relative_to_full(),
        # assuming http only when the URL carries no scheme.
        self._protocol_ = page.scheme or "http"
        self.base_uri = page.netloc

        doc = Document(url, source)
        if source is None:
            return doc

        try:
            dom = lxml.html.fromstring(source)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
            # ParserError is what lxml raises for an empty or
            # whitespace-only body.
            self._logger.error(
                f"XMLSyntaxError: Invalid source document or truncated source at {url}"
            )
            self._broken_links.append(BrokenLink(url, 0))
            return doc

        self.visited_urls.add(url)
        doc.dom = dom

        title_element = dom.xpath("//title/text()")
        if title_element:
            doc.title = title_element[0].strip()

        # hrefs starting with one of these are fragments or junk, not targets.
        skip_prefixes = ("#", "\r", "\n", " ", "&")

        # Links discovered on this page only, so that a document never
        # inherits links collected from previously parsed pages.
        page_links: list[HtmlLink] = []
        for anchor in dom.xpath("//a/@href/.."):
            href = anchor.attrib["href"]
            if not href or href[:1] in skip_prefixes:
                continue

            link_url = urljoin(url, href)
            if urlparse(link_url).scheme not in ("http", "https", "ftp"):
                continue
            if "javascript:" in link_url:
                continue
            if link_url in self.visited_urls or link_url in self._queue:
                continue

            link = HtmlLink(link_url, "".join(anchor.xpath("./text()")).strip())
            page_links.append(link)
            self._links.append(link)

        # Every collected URL is absolute, so a link is internal exactly
        # when its host matches the page's host.
        page_host = self.get_domain_parts(doc.url)
        internal: list[HtmlLink] = []
        external: list[HtmlLink] = []
        for link in page_links:
            if self.get_domain_parts(link.url) == page_host:
                internal.append(link)
            else:
                external.append(link)

        # dict.fromkeys drops duplicate URLs while keeping first-seen order.
        doc.internal_links = list(dict.fromkeys(internal))
        doc.external_links = list(dict.fromkeys(external))
        doc.links = doc.internal_links + doc.external_links

        for link in doc.links:
            self.queue_link(link.url)

        return doc

    def relative_to_full(self, url: str) -> str:
        """Resolve ``url`` against the scheme and host of the last parsed page."""
        return urljoin(f"{self._protocol_}://{self.base_uri}", url)

    def queue_link(self, link: str) -> None:
        """Append ``link`` to the queue unless it is already visited or queued."""
        if link not in self.visited_urls and link not in self._queue:
            self._queue.append(link)

    @staticmethod
    def get_domain_parts(url: str) -> str:
        """Return the network-location (``netloc``) component of ``url``."""
        return urlparse(url).netloc
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Export crawled documents to various formats: JSON, Pandas, Polars, PyArrow."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from WebCrawler.Crawler import Document
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Serializers:
    """Export documents with flattened links and rich metadata."""

    def __init__(self, documents: "list[Document]") -> None:
        """Initialize with a list of crawled documents.

        Args:
            documents: List of Document objects from Spider.run_async()
        """
        self.documents = documents

    def to_json(self, output_path: str, include_html: bool = False) -> None:
        """Export documents to JSON file with nested link structure.

        Missing parent directories of ``output_path`` are created.

        Args:
            output_path: Path to write JSON file
            include_html: Include raw HTML source in output (default False);
                documents without a source get no ``html`` key.
        """
        data = []
        for doc in self.documents:
            doc_data = {
                "url": doc.url,
                "title": doc.title,
                "status_code": doc.status_code,
                "domain": doc.domain,
                "response_headers": doc.response_headers,
                "internal_links": [
                    {"url": link.url, "text": link.text} for link in doc.internal_links
                ],
                "external_links": [
                    {"url": link.url, "text": link.text} for link in doc.external_links
                ],
            }
            if include_html and doc.source:
                doc_data["html"] = doc.source
            data.append(doc_data)

        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the file does not depend on the platform's
        # default text encoding.
        with target.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def to_pandas(self, include_html: bool = False):
        """Export documents to pandas DataFrame with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            pandas DataFrame with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "pandas is required for to_pandas(). Install with: pip install pandas"
            ) from e

        rows = self._flatten_documents(include_html)
        return pd.DataFrame(rows) if rows else pd.DataFrame()

    def to_polars(self, include_html: bool = False):
        """Export documents to polars DataFrame with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            polars DataFrame with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as e:
            raise ImportError(
                "polars is required for to_polars(). Install with: pip install polars"
            ) from e

        rows = self._flatten_documents(include_html)
        return pl.DataFrame(rows) if rows else pl.DataFrame()

    def to_arrow(self, include_html: bool = False):
        """Export documents to PyArrow Table with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            pyarrow Table with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: pyarrow is not installed.
        """
        try:
            import pyarrow as pa
        except ImportError as e:
            raise ImportError(
                "pyarrow is required for to_arrow(). Install with: pip install pyarrow"
            ) from e

        rows = self._flatten_documents(include_html)
        if not rows:
            return pa.table({})

        # Pivot the row dictionaries into one value list per column.
        columns: dict[str, list[object]] = {}
        for row in rows:
            for key, value in row.items():
                columns.setdefault(key, []).append(value)

        return pa.table(columns)

    def _flatten_documents(self, include_html: bool = False) -> list[dict]:
        """Flatten documents with one row per link.

        A document without links still yields one row, with the three
        link columns set to ``None``.

        Args:
            include_html: Include raw HTML source in output

        Returns:
            List of flattened document rows
        """
        rows: list[dict] = []
        for doc in self.documents:
            base_row = {
                "url": doc.url,
                "title": doc.title,
                "status_code": doc.status_code,
                "domain": doc.domain,
            }
            if include_html:
                base_row["html"] = doc.source

            # Label each link by the list it came from. This avoids a
            # linear membership scan per link and keeps the label correct
            # even when the same URL appears in both lists.
            labelled = [(link, "internal") for link in doc.internal_links]
            labelled += [(link, "external") for link in doc.external_links]

            if not labelled:
                rows.append(
                    {**base_row, "link_url": None, "link_text": None, "link_type": None}
                )
                continue

            for link, link_type in labelled:
                rows.append(
                    {
                        **base_row,
                        "link_url": link.url,
                        "link_text": link.text,
                        "link_type": link_type,
                    }
                )

        return rows
|