linktrace 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
WebCrawler/Crawler.py ADDED
@@ -0,0 +1,396 @@
1
+ import asyncio
2
+ import logging
3
+ import ssl
4
+ from urllib.parse import urljoin, urlparse
5
+
6
+ import aiohttp
7
+ import lxml.etree
8
+ import lxml.html
9
+ import tldextract
10
+
11
+ from WebCrawler.cache import ResponseCache
12
+ from WebCrawler.robots import RobotsManager
13
+
14
+
15
class HtmlLink:
    """A hyperlink found in a page: its target URL plus the anchor text.

    Identity is defined by ``url`` alone: hashing, equality and ordering all
    ignore ``text``. Comparisons also accept a plain value (typically a
    ``str``), which is compared directly against ``url``.
    """

    def __init__(self, url: str, text: str) -> None:
        self.text = text
        self.url = url

    @property
    def schema(self) -> str:
        """Return the scheme component of ``url`` as parsed by ``urlparse``."""
        parsed = urlparse(self.url)
        return parsed.scheme

    @property
    def description(self) -> str:
        """Return the anchor text (alias for ``text``)."""
        return self.text

    def __repr__(self) -> str:
        return f"htmllink(url={self.url!r})"

    def __hash__(self) -> int:
        # Hash on the URL only, consistent with __eq__.
        return hash(self.url)

    def __eq__(self, other: object) -> bool:
        # Unwrap another link to its URL; anything else is compared as-is.
        other_url = other.url if isinstance(other, HtmlLink) else other
        return self.url == other_url

    def __ne__(self, other: object) -> bool:
        return not self.__eq__(other)

    def __lt__(self, other: "HtmlLink | str") -> bool:
        other_url = other.url if isinstance(other, HtmlLink) else other
        return self.url < other_url

    def __gt__(self, other: "HtmlLink | str") -> bool:
        other_url = other.url if isinstance(other, HtmlLink) else other
        return self.url > other_url
47
+
48
+
49
+ class Document:
50
+ def __init__(self, url: str, source: str | None) -> None:
51
+ self.url = url
52
+ self.source = source
53
+ self.title: str = ""
54
+ self.internal_links: list[HtmlLink] = []
55
+ self.external_links: list[HtmlLink] = []
56
+ self.links: list[HtmlLink] = []
57
+ self.broken_internal_links: list[BrokenLink] = []
58
+ self.broken_external_links: list[BrokenLink] = []
59
+ self.status_code: int = 0
60
+ self.response_headers: dict[str, str] = {}
61
+ self.dom: object = None
62
+
63
+ @property
64
+ def domain(self) -> str:
65
+ return tldextract.extract(self.url).domain
66
+
67
+
68
class CrawlException(Exception):
    """Error raised for a failure tied to a specific URL.

    Attributes:
        url: The URL the failure relates to.
        message: Human-readable description of the failure.
        details: Any extra keyword arguments supplied by the raiser.
    """

    def __init__(self, url: str, msg: str, **kw: object) -> None:
        """Record the URL, message and optional extra context.

        Args:
            url: The URL the failure relates to.
            msg: Human-readable description of the failure.
            **kw: Arbitrary extra context, kept in ``details``.
        """
        self.url = url
        self.message = msg
        # Exception.__init__ rejects keyword arguments with TypeError, so the
        # extra context is kept on the instance instead of being forwarded.
        self.details: dict[str, object] = dict(kw)
        super().__init__(url, msg)
73
+
74
+
75
class BrokenLink(HtmlLink):
    """A link that could not be used, tagged with a status code.

    The link text inherited from ``HtmlLink`` is set to the status code
    rendered as a string.
    """

    def __init__(self, url: str, status: int) -> None:
        super().__init__(url, str(status))
        self.status_code = status
79
+
80
+
81
class Crawler:
    """Asynchronous page fetcher and link extractor.

    Use as an async context manager so the aiohttp session is created and
    closed around the crawl::

        async with Crawler(...) as crawler:
            doc = await crawler.crawl_document_async(url)

    State shared across calls: ``visited_urls`` (pages parsed so far),
    ``_queue`` (URLs discovered but not yet visited), ``_links`` (every link
    collected by this crawler) and ``_broken_links``.
    """

    def __init__(
        self,
        log_level: int = logging.DEBUG,
        log_name: str | None = None,
        ssl_verify: bool | str = True,
        verify_hostname: bool = True,
        request_timeout: int = 30,
        cache_dir: str | None = None,
        max_retries: int = 3,
        backoff_factor: int = 2,
        request_delay: float = 0.0,
        user_agent: str = "WebCrawler/0.1.0",
        respect_robots_txt: bool = True,
    ) -> None:
        """Configure the crawler; no network resources are created here.

        Args:
            log_level: Level applied to the crawler's logger.
            log_name: Logger name; defaults to this module's ``__name__``.
            ssl_verify: ``True`` for default verification, ``False`` to
                disable it, or a path to a CA bundle file.
            verify_hostname: Whether TLS hostname checking stays enabled.
            request_timeout: Total per-request timeout in seconds.
            cache_dir: Directory for the response cache; falsy disables it.
            max_retries: Number of fetch attempts for transient errors.
            backoff_factor: Multiplier for the exponential retry delay.
            request_delay: Per-domain delay used when robots.txt handling
                is off.
            user_agent: Value sent in the ``User-Agent`` header.
            respect_robots_txt: Whether a ``RobotsManager`` is created and
                consulted for the crawl delay.
        """
        self._logger = logging.getLogger(log_name if log_name else __name__)
        self._logger.setLevel(log_level)
        self.visited_urls: set[str] = set()
        self._queue: list[str] = []
        self._links: list[HtmlLink] = []
        self._broken_links: list[BrokenLink] = []

        # Base used by relative_to_full(); refreshed on every parse_document()
        # call. Defined here so the attributes exist before the first parse.
        self._protocol_: str = "http"
        self.base_uri: str = ""

        # Session configuration
        self.session: aiohttp.ClientSession | None = None
        self.ssl_verify: bool | str = ssl_verify  # bool or str(path to CA cert)
        self.verify_hostname: bool = verify_hostname
        self.request_timeout: int = request_timeout
        self.max_retries: int = max_retries
        self.backoff_factor: int = backoff_factor

        # Rate limiting and robots.txt
        self.request_delay: float = request_delay
        self.user_agent: str = user_agent
        self.respect_robots_txt: bool = respect_robots_txt
        self.robots_manager: RobotsManager | None = None
        self._domain_locks: dict[str, asyncio.Lock] = {}
        self._last_request_time: dict[str, float] = {}

        # Caching (opt-in)
        self.cache: ResponseCache | None = (
            ResponseCache(cache_dir) if cache_dir else None
        )

        # Cookies handled automatically via CookieJar in session (created lazily)
        self._cookie_jar: aiohttp.CookieJar | None = None

    async def __aenter__(self) -> "Crawler":
        """Enter async context manager: create and setup session."""
        self.session = await self._create_session()
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: object,
    ) -> None:
        """Exit async context manager: close and detach the session."""
        # Detach first so a later crawl_document_async() call reports the
        # "not initialized" error instead of using a closed session.
        session, self.session = self.session, None
        if session:
            await session.close()

    async def _create_session(self) -> aiohttp.ClientSession:
        """Create persistent aiohttp session with SSL context, connector, timeouts."""
        # Create cookie jar now (requires event loop)
        if self._cookie_jar is None:
            self._cookie_jar = aiohttp.CookieJar()

        ssl_context = self._build_ssl_context()

        connector = aiohttp.TCPConnector(
            limit=100,
            limit_per_host=10,
            ttl_dns_cache=300,
            ssl=ssl_context,
        )

        session = aiohttp.ClientSession(
            connector=connector,
            cookie_jar=self._cookie_jar,
            timeout=aiohttp.ClientTimeout(total=self.request_timeout),
        )

        # Initialize RobotsManager if robots.txt respect enabled
        if self.respect_robots_txt:
            self.robots_manager = RobotsManager(self.user_agent, session)

        return session

    def _build_ssl_context(self) -> ssl.SSLContext:
        """Build SSL context with flexible verification options.

        Supports:
        - ssl_verify=True (default): verify certs with system CA bundle
        - ssl_verify=False: disable verification (insecure, for testing)
        - ssl_verify="/path/to/ca.pem": verify with custom CA bundle (corporate proxies)

        Raises:
            FileNotFoundError: The custom CA bundle path does not exist.
            ssl.SSLError: The custom CA bundle could not be loaded.
        """
        context = ssl.create_default_context()

        if self.ssl_verify is False:
            # Completely disable verification (INSECURE). check_hostname must
            # be cleared before verify_mode can be set to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            self._logger.warning(
                "SSL certificate verification disabled. "
                "This is insecure and should only be used for testing."
            )
        elif isinstance(self.ssl_verify, str):
            # Load custom CA bundle (corporate proxy scenario)
            try:
                context.load_verify_locations(self.ssl_verify)
                self._logger.debug(f"Loaded custom CA bundle: {self.ssl_verify}")
            except FileNotFoundError:
                self._logger.error(f"CA bundle not found: {self.ssl_verify}")
                raise
            except ssl.SSLError as e:
                self._logger.error(f"Error loading CA bundle: {e}")
                raise
        # else: ssl_verify=True, use defaults (CERT_REQUIRED + system CA bundle)

        # Optionally disable hostname checking (independent of cert verification)
        if not self.verify_hostname:
            context.check_hostname = False
            self._logger.warning(
                "Hostname verification disabled. "
                "This weakens security and should only be used for testing."
            )

        return context

    async def _rate_limit_domain(self, url: str, delay: float) -> None:
        """Enforce per-domain rate limiting using asyncio.Lock.

        Ensures ``delay`` seconds pass between requests to the same domain
        (the URL's netloc). The first request to a domain never waits.
        """
        import time

        domain = urlparse(url).netloc

        # Get or create lock for domain
        if domain not in self._domain_locks:
            self._domain_locks[domain] = asyncio.Lock()

        async with self._domain_locks[domain]:
            # Intervals are measured on the monotonic clock so that wall-clock
            # adjustments cannot shorten or stretch the wait.
            last_time = self._last_request_time.get(domain)
            if last_time is not None:
                wait_time = delay - (time.monotonic() - last_time)
                if wait_time > 0:
                    self._logger.debug(
                        f"Rate limit: waiting {wait_time:.2f}s before {domain}"
                    )
                    await asyncio.sleep(wait_time)

            self._last_request_time[domain] = time.monotonic()

    async def crawl_document_async(self, url: str) -> Document | None:
        """Fetch and parse a document with retries, with optional caching.

        Args:
            url: Absolute URL to fetch.

        Returns:
            A parsed ``Document`` for HTTP 200 responses (or cache hits); a
            ``Document`` with ``source=None`` and the status recorded for any
            other HTTP status; ``None`` when the request failed with a client
            error or after all retry attempts.

        Raises:
            RuntimeError: The session has not been created (the crawler is
                not being used inside ``async with``).
            Exception: Any unexpected error is logged and re-raised.
        """
        if not self.session:
            raise RuntimeError(
                "Crawler.session not initialized. "
                "Use 'async with Crawler(...) as crawler:' context manager."
            )

        # Check cache first
        if self.cache:
            cached = await self.cache.get(url)
            if cached:
                self._logger.debug(f"Cache hit for {url}")
                doc = self.parse_document(url, cached.content)
                doc.status_code = cached.status_code
                doc.response_headers = cached.response_headers
                return doc

        # Get effective delay (robots.txt or configured)
        if self.respect_robots_txt and self.robots_manager:
            delay = await self.robots_manager.get_crawl_delay(url)
            # NOTE(review): get_crawl_delay's return contract is not visible
            # here; fall back to the configured delay if it yields None.
            if delay is None:
                delay = self.request_delay
        else:
            delay = self.request_delay

        # Enforce per-domain rate limiting
        await self._rate_limit_domain(url, delay)

        headers = {"User-Agent": self.user_agent}

        # Implement retry logic
        for attempt in range(self.max_retries):
            try:
                self._logger.debug(f"Fetching {url} (attempt {attempt + 1})")
                async with self.session.get(url, headers=headers) as response:
                    status = response.status
                    response_headers = dict(response.headers)

                    if status != 200:
                        self._logger.error(f"Failed to fetch {url}: HTTP {status}")
                        doc = Document(url, None)
                        doc.status_code = status
                        doc.response_headers = response_headers
                        return doc

                    html = await response.text()

                    doc = self.parse_document(url, html)
                    doc.status_code = status
                    doc.response_headers = response_headers

                    # Cache successful responses
                    if self.cache:
                        await self.cache.set(url, status, response_headers, html)

                    return doc

            except (
                TimeoutError,
                # Distinct from the builtin TimeoutError before Python 3.11.
                asyncio.TimeoutError,
                aiohttp.ClientConnectorError,
                aiohttp.ServerTimeoutError,
            ) as e:
                if attempt < self.max_retries - 1:
                    # Exponential backoff: factor, 2*factor, 4*factor, ...
                    wait_time = 2**attempt * self.backoff_factor
                    self._logger.warning(
                        f"Transient error fetching {url} "
                        f"(attempt {attempt + 1}/{self.max_retries}): {e}. "
                        f"Retrying in {wait_time}s..."
                    )
                    await asyncio.sleep(wait_time)
                else:
                    self._logger.error(
                        f"Failed to fetch {url} after {self.max_retries} attempts: {e}"
                    )
            except aiohttp.ClientError as e:
                # Non-transient client errors are not retried.
                self._logger.error(f"Client error while fetching {url}: {e}")
                break
            except Exception as e:
                self._logger.error(f"Unexpected error while fetching {url}: {e}")
                raise

        return None

    def parse_document(self, url: str, source: str | None) -> Document:
        """Parse HTML into a ``Document`` and queue the links it contains.

        Side effects: updates the base used by ``relative_to_full``, marks
        ``url`` as visited, appends new links to ``_links`` and ``_queue``,
        and records a ``BrokenLink`` when the source cannot be parsed.

        Args:
            url: URL the source was fetched from; hrefs are resolved against it.
            source: HTML text, or ``None`` when there is no body.

        Returns:
            The populated ``Document``. When ``source`` is ``None`` or cannot
            be parsed, only ``url`` and ``source`` are set.
        """
        parsed_url = urlparse(url)
        # Fall back to "http" only when the URL carries no scheme.
        self._protocol_ = parsed_url.scheme or "http"
        self.base_uri = parsed_url.netloc

        doc = Document(url, source)
        if source is None:
            return doc

        try:
            dom = lxml.html.fromstring(source)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError) as e:
            # ParserError covers sources lxml reports as empty documents.
            self._logger.error(
                f"{type(e).__name__}: Invalid source document or truncated source at {url}"
            )
            self._broken_links.append(BrokenLink(url, 0))
            return doc

        self.visited_urls.add(url)
        doc.dom = dom

        title_element = dom.xpath("//title/text()")
        if title_element:
            doc.title = title_element[0].strip()

        skip_words = ["#", "\r", "\n", " ", "&amp;"]

        # Links found on THIS page only. self._links keeps accumulating
        # crawler-wide, but the document must not inherit earlier pages' links.
        page_links: list[HtmlLink] = []
        for anchor in dom.xpath("//a/@href/.."):
            href = anchor.attrib["href"]
            if not href or href[:1] in skip_words:
                continue

            try:
                link_url = urljoin(url, href)
                scheme = urlparse(link_url).scheme
            except ValueError:
                # Malformed href; skip it rather than abort the whole page.
                continue

            if scheme not in ("http", "https", "ftp"):
                continue
            if "javascript:" in link_url:
                continue
            # Only links not already visited or queued are collected.
            if link_url in self.visited_urls or link_url in self._queue:
                continue

            title = "".join(anchor.xpath("./text()")).strip()
            link = HtmlLink(link_url, title)
            self._links.append(link)
            page_links.append(link)

        # A link is internal when its netloc matches the document's netloc.
        doc_domain = self.get_domain_parts(doc.url)
        internal: list[HtmlLink] = []
        external: list[HtmlLink] = []
        for link in page_links:
            if self.get_domain_parts(link.url) == doc_domain:
                internal.append(link)
            else:
                external.append(link)

        # De-duplicate by URL while keeping first-seen order.
        doc.internal_links = list(dict.fromkeys(internal))
        doc.external_links = list(dict.fromkeys(external))
        doc.links = doc.internal_links + doc.external_links

        for link in doc.links:
            self.queue_link(link.url)

        return doc

    def relative_to_full(self, url: str) -> str:
        """Resolve ``url`` against the scheme and host of the last parsed page."""
        return urljoin(f"{self._protocol_}://{self.base_uri}", url)

    def queue_link(self, link: str) -> None:
        """Append ``link`` to the queue unless already visited or queued."""
        if link not in self.visited_urls and link not in self._queue:
            self._queue.append(link)

    @staticmethod
    def get_domain_parts(url: str) -> str:
        """Return the netloc (host, plus port/credentials if present) of ``url``."""
        return urlparse(url).netloc
@@ -0,0 +1,165 @@
1
+ """Export crawled documents to various formats: JSON, Pandas, Polars, PyArrow."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from WebCrawler.Crawler import Document
7
+
8
+
9
class Serializers:
    """Export documents with flattened links and rich metadata.

    JSON output keeps links nested under each document; the tabular exports
    (pandas, polars, PyArrow) emit one row per link.
    """

    def __init__(self, documents: list[Document]) -> None:
        """Initialize with a list of crawled documents.

        Args:
            documents: List of Document objects from Spider.run_async()
        """
        self.documents = documents

    def to_json(self, output_path: str, include_html: bool = False) -> None:
        """Export documents to JSON file with nested link structure.

        Parent directories of ``output_path`` are created if missing.

        Args:
            output_path: Path to write JSON file
            include_html: Include raw HTML source in output (default False);
                the ``html`` key is omitted for documents with no source.
        """
        data = []
        for doc in self.documents:
            doc_data = {
                "url": doc.url,
                "title": doc.title,
                "status_code": doc.status_code,
                "domain": doc.domain,
                "response_headers": doc.response_headers,
                "internal_links": [
                    {"url": link.url, "text": link.text} for link in doc.internal_links
                ],
                "external_links": [
                    {"url": link.url, "text": link.text} for link in doc.external_links
                ],
            }
            if include_html and doc.source:
                doc_data["html"] = doc.source
            data.append(doc_data)

        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the file does not depend on the platform default.
        with target.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def to_pandas(self, include_html: bool = False):
        """Export documents to pandas DataFrame with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            pandas DataFrame with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "pandas is required for to_pandas(). Install with: pip install pandas"
            ) from e

        rows = self._flatten_documents(include_html)
        return pd.DataFrame(rows) if rows else pd.DataFrame()

    def to_polars(self, include_html: bool = False):
        """Export documents to polars DataFrame with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            polars DataFrame with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as e:
            raise ImportError(
                "polars is required for to_polars(). Install with: pip install polars"
            ) from e

        rows = self._flatten_documents(include_html)
        return pl.DataFrame(rows) if rows else pl.DataFrame()

    def to_arrow(self, include_html: bool = False):
        """Export documents to PyArrow Table with flattened links.

        One row per link; document metadata is repeated for each link.

        Args:
            include_html: Include raw HTML source in output (default False)

        Returns:
            pyarrow Table with columns: url, title, status_code, domain,
            link_url, link_text, link_type (internal/external), and optional html

        Raises:
            ImportError: pyarrow is not installed.
        """
        try:
            import pyarrow as pa
        except ImportError as e:
            raise ImportError(
                "pyarrow is required for to_arrow(). Install with: pip install pyarrow"
            ) from e

        rows = self._flatten_documents(include_html)
        if not rows:
            return pa.table({})

        # Pivot the row dicts into column lists, keeping first-seen key order.
        columns: dict[str, list[object]] = {}
        for row in rows:
            for key, value in row.items():
                columns.setdefault(key, []).append(value)

        return pa.table(columns)

    def _flatten_documents(self, include_html: bool = False) -> list[dict]:
        """Flatten documents with one row per link.

        Internal links come first, then external links. A document with no
        links still yields one row, with the three link columns set to None.

        Args:
            include_html: Include raw HTML source in output

        Returns:
            List of flattened document rows
        """
        rows: list[dict] = []
        for doc in self.documents:
            base_row = {
                "url": doc.url,
                "title": doc.title,
                "status_code": doc.status_code,
                "domain": doc.domain,
            }

            if include_html:
                base_row["html"] = doc.source

            # Tag each link by the list it came from, rather than searching
            # internal_links again for every link.
            tagged_links = [(link, "internal") for link in doc.internal_links]
            tagged_links += [(link, "external") for link in doc.external_links]

            if not tagged_links:
                rows.append(
                    {**base_row, "link_url": None, "link_text": None, "link_type": None}
                )
                continue

            for link, link_type in tagged_links:
                rows.append(
                    {
                        **base_row,
                        "link_url": link.url,
                        "link_text": link.text,
                        "link_type": link_type,
                    }
                )

        return rows