sitewise-crawler 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sitewise_crawler-0.1.0/PKG-INFO +100 -0
- sitewise_crawler-0.1.0/README.md +75 -0
- sitewise_crawler-0.1.0/pyproject.toml +39 -0
- sitewise_crawler-0.1.0/setup.cfg +4 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/__init__.py +21 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/analyzer.py +162 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/crawler.py +132 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/extractors.py +113 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/fetchers.py +98 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler/models.py +106 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler.egg-info/PKG-INFO +100 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler.egg-info/SOURCES.txt +15 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler.egg-info/dependency_links.txt +1 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler.egg-info/requires.txt +11 -0
- sitewise_crawler-0.1.0/src/sitewise_crawler.egg-info/top_level.txt +1 -0
- sitewise_crawler-0.1.0/tests/test_analyzer.py +48 -0
- sitewise_crawler-0.1.0/tests/test_local.py +38 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitewise-crawler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A flexible and advanced web crawler for modern SPAs and traditional websites.
|
|
5
|
+
Author-email: TarXemo <info@tarxemo.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/tarxemo/sitewise-crawler
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/tarxemo/sitewise-crawler/issues
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: requests
|
|
15
|
+
Requires-Dist: beautifulsoup4
|
|
16
|
+
Requires-Dist: playwright
|
|
17
|
+
Requires-Dist: trafilatura
|
|
18
|
+
Requires-Dist: lxml
|
|
19
|
+
Requires-Dist: pydantic
|
|
20
|
+
Requires-Dist: aiohttp
|
|
21
|
+
Requires-Dist: tenacity
|
|
22
|
+
Requires-Dist: pypdf
|
|
23
|
+
Requires-Dist: python-docx
|
|
24
|
+
Requires-Dist: groq
|
|
25
|
+
|
|
26
|
+
# Sitewise Crawler 🕷️
|
|
27
|
+
|
|
28
|
+
An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
|
|
29
|
+
|
|
30
|
+
## ✨ Features
|
|
31
|
+
|
|
32
|
+
- š **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
|
|
33
|
+
- 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
|
|
34
|
+
- š **SPA Link Discovery**: Discovers links even in complex client-side routers.
|
|
35
|
+
- 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
|
|
36
|
+
- š **Pydantic Models**: Type-safe configuration and results.
|
|
37
|
+
|
|
38
|
+
## 📦 Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install sitewise-crawler
|
|
42
|
+
playwright install chromium
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## š Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import asyncio
|
|
49
|
+
from sitewise_crawler import SPACrawler, CrawlerConfig
|
|
50
|
+
|
|
51
|
+
async def main():
|
|
52
|
+
# 1. Configure the crawler
|
|
53
|
+
config = CrawlerConfig(
|
|
54
|
+
start_url="https://example.com",
|
|
55
|
+
max_depth=2,
|
|
56
|
+
max_pages=10,
|
|
57
|
+
use_playwright=True,
|
|
58
|
+
headless=True
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# 2. Initialize and run
|
|
62
|
+
crawler = SPACrawler(config)
|
|
63
|
+
|
|
64
|
+
# Optional: Add a callback for each page crawled
|
|
65
|
+
crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
|
|
66
|
+
|
|
67
|
+
result = await crawler.crawl()
|
|
68
|
+
|
|
69
|
+
# 3. Process results
|
|
70
|
+
if result.success:
|
|
71
|
+
print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
|
|
72
|
+
for page in result.pages_all:
|
|
73
|
+
print(f"- {page.url} ({len(page.content)} chars)")
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
|
|
76
|
+
asyncio.run(main())
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## ⚙️ Configuration Options
|
|
80
|
+
|
|
81
|
+
The `CrawlerConfig` class supports the following parameters:
|
|
82
|
+
|
|
83
|
+
| Parameter | Type | Default | Description |
|
|
84
|
+
|-----------|------|---------|-------------|
|
|
85
|
+
| `start_url` | `str` | *Required* | The entry point for the crawler. |
|
|
86
|
+
| `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
|
|
87
|
+
| `max_pages` | `int` | `100` | Stop crawling after this many pages. |
|
|
88
|
+
| `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
|
|
89
|
+
| `headless` | `bool` | `True` | Run browser in headless mode. |
|
|
90
|
+
| `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
|
|
91
|
+
| `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
|
|
92
|
+
|
|
93
|
+
## 🤝 Contributing
|
|
94
|
+
|
|
95
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
96
|
+
|
|
97
|
+
## š License
|
|
98
|
+
|
|
99
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
100
|
+
# sitewise_crawler
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Sitewise Crawler 🕷️
|
|
2
|
+
|
|
3
|
+
An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
|
|
4
|
+
|
|
5
|
+
## ✨ Features
|
|
6
|
+
|
|
7
|
+
- š **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
|
|
8
|
+
- 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
|
|
9
|
+
- š **SPA Link Discovery**: Discovers links even in complex client-side routers.
|
|
10
|
+
- 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
|
|
11
|
+
- š **Pydantic Models**: Type-safe configuration and results.
|
|
12
|
+
|
|
13
|
+
## 📦 Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install sitewise-crawler
|
|
17
|
+
playwright install chromium
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## š Quick Start
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
import asyncio
|
|
24
|
+
from sitewise_crawler import SPACrawler, CrawlerConfig
|
|
25
|
+
|
|
26
|
+
async def main():
|
|
27
|
+
# 1. Configure the crawler
|
|
28
|
+
config = CrawlerConfig(
|
|
29
|
+
start_url="https://example.com",
|
|
30
|
+
max_depth=2,
|
|
31
|
+
max_pages=10,
|
|
32
|
+
use_playwright=True,
|
|
33
|
+
headless=True
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# 2. Initialize and run
|
|
37
|
+
crawler = SPACrawler(config)
|
|
38
|
+
|
|
39
|
+
# Optional: Add a callback for each page crawled
|
|
40
|
+
crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
|
|
41
|
+
|
|
42
|
+
result = await crawler.crawl()
|
|
43
|
+
|
|
44
|
+
# 3. Process results
|
|
45
|
+
if result.success:
|
|
46
|
+
print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
|
|
47
|
+
for page in result.pages_all:
|
|
48
|
+
print(f"- {page.url} ({len(page.content)} chars)")
|
|
49
|
+
|
|
50
|
+
if __name__ == "__main__":
|
|
51
|
+
asyncio.run(main())
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## ⚙️ Configuration Options
|
|
55
|
+
|
|
56
|
+
The `CrawlerConfig` class supports the following parameters:
|
|
57
|
+
|
|
58
|
+
| Parameter | Type | Default | Description |
|
|
59
|
+
|-----------|------|---------|-------------|
|
|
60
|
+
| `start_url` | `str` | *Required* | The entry point for the crawler. |
|
|
61
|
+
| `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
|
|
62
|
+
| `max_pages` | `int` | `100` | Stop crawling after this many pages. |
|
|
63
|
+
| `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
|
|
64
|
+
| `headless` | `bool` | `True` | Run browser in headless mode. |
|
|
65
|
+
| `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
|
|
66
|
+
| `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
|
|
67
|
+
|
|
68
|
+
## 🤝 Contributing
|
|
69
|
+
|
|
70
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
71
|
+
|
|
72
|
+
## š License
|
|
73
|
+
|
|
74
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
75
|
+
# sitewise_crawler
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sitewise-crawler"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="TarXemo", email="info@tarxemo.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A flexible and advanced web crawler for modern SPAs and traditional websites."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"requests",
|
|
22
|
+
"beautifulsoup4",
|
|
23
|
+
"playwright",
|
|
24
|
+
"trafilatura",
|
|
25
|
+
"lxml",
|
|
26
|
+
"pydantic",
|
|
27
|
+
"aiohttp",
|
|
28
|
+
"tenacity",
|
|
29
|
+
"pypdf",
|
|
30
|
+
"python-docx",
|
|
31
|
+
"groq",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
"Homepage" = "https://github.com/tarxemo/sitewise-crawler"
|
|
36
|
+
"Bug Tracker" = "https://github.com/tarxemo/sitewise-crawler/issues"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from .crawler import SPACrawler
|
|
2
|
+
from .models import CrawlerConfig, PageData, CrawlResult, UserInsight, CategoryScore
|
|
3
|
+
from .fetchers import RequestsFetcher, PlaywrightFetcher
|
|
4
|
+
from .extractors import LinkExtractor, ContentExtractor, SPADetector
|
|
5
|
+
from .analyzer import InsightEngine
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__all__ = [
|
|
9
|
+
"SPACrawler",
|
|
10
|
+
"CrawlerConfig",
|
|
11
|
+
"PageData",
|
|
12
|
+
"CrawlResult",
|
|
13
|
+
"UserInsight",
|
|
14
|
+
"CategoryScore",
|
|
15
|
+
"RequestsFetcher",
|
|
16
|
+
"PlaywrightFetcher",
|
|
17
|
+
"LinkExtractor",
|
|
18
|
+
"ContentExtractor",
|
|
19
|
+
"SPADetector",
|
|
20
|
+
"InsightEngine",
|
|
21
|
+
]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import json
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import List, Optional, Dict, Any
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from groq import Groq
|
|
7
|
+
from .crawler import SPACrawler
|
|
8
|
+
from .models import CrawlerConfig, UserInsight, CategoryScore, PageData
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
class InsightEngine:
    """
    Advanced engine for analyzing user behavior based on visited content.
    Uses AI (Groq) to provide deep insights.

    Each supplied URL is scraped with ``SPACrawler.scrape_page``, a short
    preview of every page is concatenated into one prompt, the prompt is sent
    to a Groq chat model, and the JSON answer is mapped onto ``UserInsight``.
    """

    def __init__(self, api_key: str, model: str = "llama-3.1-70b-versatile"):
        """
        Create the Groq client.

        Args:
            api_key: API key handed to the ``Groq`` client.
            model: Name of the chat model used for the analysis.
        """
        self.client = Groq(api_key=api_key)
        self.model = model

    async def analyze_user_behavior(
        self,
        user_id: str,
        urls: List[str],
        crawler_config: Optional[CrawlerConfig] = None
    ) -> UserInsight:
        """
        Scrapes a list of URLs and performs intensive AI analysis on the content.

        Args:
            user_id: Identifier copied into the resulting ``UserInsight``.
            urls: Pages to scrape; must contain at least one URL.
            crawler_config: Optional crawler settings. When omitted, a config
                is built from the first URL with ``max_pages=len(urls)``.

        Returns:
            A ``UserInsight`` populated from the model's JSON answer, with
            defaults for any field the answer does not contain.

        Raises:
            ValueError: If ``urls`` is empty or no page could be scraped.
        """
        # Guard first: ``urls[0]`` below would otherwise fail with IndexError.
        if not urls:
            raise ValueError("At least one URL is required for behavioral analysis.")

        logger.info(f"Starting behavioral analysis for user {user_id} with {len(urls)} URLs")

        # 1. Scrape all URLs
        if not crawler_config:
            crawler_config = CrawlerConfig(start_url=urls[0], max_pages=len(urls))

        crawler = SPACrawler(crawler_config)

        try:
            scraped = await asyncio.gather(*(crawler.scrape_page(url) for url in urls))
        finally:
            # ``crawl()`` is bypassed here, so the browser that ``scrape_page``
            # may have started has to be shut down explicitly.
            await crawler.playwright_fetcher.close()

        pages: List[PageData] = [p for p in scraped if p is not None]

        if not pages:
            raise ValueError("No content could be extracted from the provided URLs.")

        # 2. Consolidate content for analysis
        # We take chunks of content from each page to stay within LLM context limits
        consolidated_content = "".join(
            # Take first 1500 chars from each page
            f"\n--- URL: {page.url} ---\nTitle: {page.title}\nContent: {page.content[:1500]}\n"
            for page in pages
        )

        # 3. Perform AI Analysis
        analysis_result = await self._call_ai_analyzer(consolidated_content)

        # 4. Construct UserInsight model
        from .models import Entity

        # Skip category entries that lack the expected keys instead of
        # failing the whole analysis on one malformed item.
        top_categories = [
            CategoryScore(category=c["name"], score=c["score"])
            for c in analysis_result.get("top_categories", [])
            if isinstance(c, dict) and "name" in c and "score" in c
        ]

        return UserInsight(
            user_id=user_id,
            total_urls_analyzed=len(pages),
            # Core
            primary_interests=analysis_result.get("primary_interests", []),
            top_categories=top_categories,
            content_languages=analysis_result.get("content_languages", []),
            content_complexity_score=analysis_result.get("content_complexity_score", 0.0),
            technical_proficiency_level=analysis_result.get("technical_proficiency_level", "Unknown"),

            # Sentiment
            overall_sentiment=analysis_result.get("overall_sentiment", "Neutral"),
            average_sentiment_score=analysis_result.get("average_sentiment_score", 0.0),
            stress_or_anxiety_indicators=analysis_result.get("stress_or_anxiety_indicators", 0.0),

            # Behavior
            estimated_intent=analysis_result.get("estimated_intent", "Unknown"),
            productivity_rating=analysis_result.get("productivity_rating", 0.0),
            time_wasting_probability=analysis_result.get("time_wasting_probability", 0.0),
            focus_score=analysis_result.get("focus_score", 0.0),

            # Academic/Career
            educational_alignment=analysis_result.get("educational_alignment", 0.0),
            academic_relevance_score=analysis_result.get("academic_relevance_score", 0.0),
            career_development_focus=analysis_result.get("career_development_focus", 0.0),

            # Commercial
            shopping_intent_score=analysis_result.get("shopping_intent_score", 0.0),
            likely_in_market_for=analysis_result.get("likely_in_market_for", []),

            # Entities
            top_entities=[
                Entity(**e) for e in analysis_result.get("top_entities", [])
            ],

            # Risk
            risk_assessment_summary=analysis_result.get("risk_assessment_summary"),
            nsfw_or_inappropriate_probability=analysis_result.get("nsfw_or_inappropriate_probability", 0.0),

            # Summary
            behavioral_summary=analysis_result.get("behavioral_summary", "No summary available."),
            actionable_recommendation=analysis_result.get("actionable_recommendation"),
            raw_ai_response=analysis_result
        )

    async def _call_ai_analyzer(self, content: str) -> Dict[str, Any]:
        """
        Calls Groq to analyze the consolidated content.

        Args:
            content: Concatenated page previews to embed in the prompt.

        Returns:
            The parsed JSON object from the model. On any failure a minimal
            dict with ``behavioral_summary`` and ``overall_sentiment`` is
            returned instead of raising.
        """
        prompt = f"""
        You are a highly advanced Behavioral Profiling AI working for an institutional analytics platform.
        Analyze the following browsing history content and provide a massive, highly detailed behavioral profile of the user.
        Return ONLY a JSON object exactly matching this structure (fill in the values based on your analysis):
        {{
            "primary_interests": ["topic1", "topic2", "topic3"],
            "top_categories": [{{"name": "CategoryName", "score": 0.9}}],
            "content_languages": ["English"],
            "content_complexity_score": 0.8,
            "technical_proficiency_level": "Intermediate",
            "overall_sentiment": "Positive",
            "average_sentiment_score": 0.5,
            "stress_or_anxiety_indicators": 0.1,
            "estimated_intent": "Research",
            "productivity_rating": 0.85,
            "time_wasting_probability": 0.15,
            "focus_score": 0.9,
            "educational_alignment": 0.9,
            "academic_relevance_score": 0.8,
            "career_development_focus": 0.7,
            "shopping_intent_score": 0.0,
            "likely_in_market_for": [],
            "top_entities": [{{"name": "OpenAI", "type": "Organization", "frequency": 5}}],
            "risk_assessment_summary": "Low risk. Professional content.",
            "nsfw_or_inappropriate_probability": 0.0,
            "behavioral_summary": "A comprehensive paragraph summarizing habits.",
            "actionable_recommendation": "Suggest resource X based on interest Y."
        }}

        Content to analyze:
        {content}
        """

        try:
            # The Groq client call is synchronous; run it in a worker thread
            # so the event loop is not blocked while waiting for the answer.
            chat_completion = await asyncio.to_thread(
                self.client.chat.completions.create,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a specialized User Behavior Analyst. You extract deep, multi-dimensional insights from web content history. Output strictly valid JSON."
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
                response_format={"type": "json_object"}
            )

            return json.loads(chat_completion.choices[0].message.content)
        except Exception as e:
            # Deliberate best effort: an analysis failure degrades to a stub
            # result rather than aborting the caller.
            logger.error(f"AI Analysis failed: {e}")
            return {
                "behavioral_summary": f"Failed to perform AI analysis: {str(e)}",
                "overall_sentiment": "Unknown"
            }
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import time
|
|
4
|
+
from collections import deque
|
|
5
|
+
from typing import Set, List, Optional, Callable
|
|
6
|
+
from .models import CrawlerConfig, PageData, CrawlResult
|
|
7
|
+
from .fetchers import RequestsFetcher, PlaywrightFetcher
|
|
8
|
+
from .extractors import LinkExtractor, ContentExtractor, SPADetector
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
class SPACrawler:
    """
    Advanced Crawler Engine that automatically handles SPAs and traditional websites.

    Every page is fetched with ``RequestsFetcher`` first. When the returned
    HTML looks like a single-page application and ``config.use_playwright`` is
    set, the page is fetched again through ``PlaywrightFetcher``.
    """

    def __init__(self, config: CrawlerConfig):
        """Initialise crawl state (queue, visited set, results) and the two fetchers."""
        self.config = config
        self.visited: Set[str] = set()
        self.queue = deque([(config.start_url, 0)])
        self.results: List[PageData] = []
        self.failed_urls: List[str] = []

        self.requests_fetcher = RequestsFetcher()
        self.playwright_fetcher = PlaywrightFetcher()

        # User-defined hook for page processing
        self.on_page_crawled: Optional[Callable[[PageData], None]] = None

    def _should_crawl(self, url: str, depth: int) -> bool:
        """Return True when *url* at *depth* is still eligible for fetching."""
        if url in self.visited:
            return False
        if depth > self.config.max_depth:
            return False
        if len(self.results) >= self.config.max_pages:
            return False
        # NOTE(review): only the truthiness of ``allowed_domains`` is used; the
        # check is against the start URL's domain, not the listed domains.
        if self.config.allowed_domains and not LinkExtractor.is_same_domain(url, self.config.start_url):
            return False
        return True

    def _is_ignored(self, url: str) -> bool:
        """Return True when a discovered link matches one of ``config.ignore_patterns``."""
        import re  # local import: this module has no top-level ``re`` import

        return any(re.search(pattern, url) for pattern in self.config.ignore_patterns)

    async def crawl(self) -> CrawlResult:
        """
        Starts the full BFS crawling process based on config.

        Returns:
            A ``CrawlResult`` with the scraped pages, the URLs that failed, the
            elapsed time and the page count. ``success`` is True when at least
            one page was scraped.
        """
        start_time = time.time()
        logger.info(f"Starting crawl for {self.config.start_url}")

        try:
            while self.queue and len(self.results) < self.config.max_pages:
                url, depth = self.queue.popleft()

                if not self._should_crawl(url, depth):
                    continue

                self.visited.add(url)

                # Step 1: Extract the page
                page_data = await self.scrape_page(url, depth=depth)

                if not page_data:
                    self.failed_urls.append(url)
                    continue

                self.results.append(page_data)

                # Callback
                if self.on_page_crawled:
                    self.on_page_crawled(page_data)

                # Step 2: Add new links to queue (only if we haven't hit max depth)
                if depth < self.config.max_depth:
                    for link in page_data.links:
                        # Skip links that would be rejected on dequeue anyway
                        # and links excluded by the configured patterns.
                        if link in self.visited or self._is_ignored(link):
                            continue
                        if LinkExtractor.is_same_domain(link, self.config.start_url):
                            self.queue.append((link, depth + 1))

                # Rate limiting
                await asyncio.sleep(self.config.rate_limit_delay)
        finally:
            # Cleanup runs even if a fetch or the user callback raises, so the
            # browser is not left running.
            await self.playwright_fetcher.close()

        duration = time.time() - start_time
        return CrawlResult(
            success=len(self.results) > 0,
            pages_all=self.results,
            failed_urls=self.failed_urls,
            duration_seconds=duration,
            total_pages=len(self.results)
        )

    async def scrape_page(self, url: str, depth: int = 0) -> Optional[PageData]:
        """
        Directly extracts data from a single URL.
        Supports HTML, SPAs, PDFs, and Word Documents.

        Args:
            url: Page to fetch.
            depth: Crawl depth recorded on the returned ``PageData``.

        Returns:
            The extracted ``PageData``, or ``None`` when the initial fetch
            produced no content.
        """
        logger.info(f"Scraping page: {url}")

        # Step 1: Fetch content
        # We start with Requests for efficiency and document handling
        content, status, title, content_type = await self.requests_fetcher.fetch(url, self.config)

        if not content:
            return None

        content_type = content_type or ""
        is_spa = False
        is_binary = False

        # Step 2: Handle based on content type
        if 'html' in content_type:
            # Check for SPA
            if self.config.use_playwright and SPADetector.is_spa(content):
                logger.info(f"SPA detected for {url}, switching to Playwright")
                is_spa = True
                rendered, r_status, r_title, r_type = await self.playwright_fetcher.fetch(url, self.config)
                if rendered and isinstance(rendered, str):
                    content, status, title = rendered, r_status, r_title
                    content_type = r_type or content_type
                else:
                    # The Playwright fetcher returns None on failure and bytes
                    # for non-HTML responses; neither can be parsed as HTML, so
                    # keep the statically fetched markup.
                    logger.warning(f"Playwright rendering failed for {url}, using static HTML")

            # Extract HTML content
            text_content = ContentExtractor.clean_text(content)
            links = LinkExtractor.extract_links(content, url)
        elif isinstance(content, str):
            # Non-HTML text (the requests fetcher returns text/* bodies as
            # str): already plain text, so it must not go to the binary extractor.
            text_content = content.strip()
            links = []
        else:
            # Handle binary documents
            logger.info(f"Binary document detected ({content_type}) for {url}")
            is_binary = True
            text_content = ContentExtractor.extract_from_binary(content, content_type)
            links = []  # Binary files usually don't have crawlable links for our BFS

        return PageData(
            url=url,
            title=title or url.split('/')[-1],
            content=text_content,
            # Raw HTML is only retained for small crawls (max_pages < 10).
            html=content if (isinstance(content, str) and self.config.max_pages < 10) else None,
            depth=depth,
            status_code=status,
            is_spa=is_spa,
            links=links,
            metadata={'content_type': content_type, 'is_binary': is_binary}
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from urllib.parse import urljoin, urlparse, urlunparse
|
|
3
|
+
from typing import List, Set
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
import trafilatura
|
|
6
|
+
|
|
7
|
+
class LinkExtractor:
    """Helpers for resolving, normalizing and filtering hyperlinks."""

    @staticmethod
    def normalize_url(url: str, base_url: str) -> str:
        """
        Normalize URL and remove fragments.

        A URL without a network location is resolved against *base_url*. The
        fragment is dropped and trailing slashes are stripped.
        """
        parsed = urlparse(url)
        if not parsed.netloc:
            parsed = urlparse(urljoin(base_url, url))

        # Remove fragments and normalize
        return urlunparse(parsed._replace(fragment='')).rstrip('/')

    @staticmethod
    def is_same_domain(url: str, base_url: str) -> bool:
        """Return True when both URLs share the same network location (host names are compared case-insensitively)."""
        return urlparse(url).netloc.lower() == urlparse(base_url).netloc.lower()

    @staticmethod
    def extract_links(html: str, base_url: str) -> List[str]:
        """
        Return the unique, normalized targets of every ``<a href>`` in *html*.

        Links are returned in document order, so repeated crawls of the same
        page enqueue them in the same order. Hrefs that cannot be parsed as a
        URL are skipped.
        """
        soup = BeautifulSoup(html, 'lxml')
        ordered = {}  # dict used as an insertion-ordered set
        for anchor in soup.find_all('a', href=True):
            try:
                ordered.setdefault(LinkExtractor.normalize_url(anchor['href'], base_url), None)
            except ValueError:
                # urlparse/urljoin raise ValueError on malformed URLs
                # (for example an unterminated IPv6 literal).
                continue
        return list(ordered)
|
|
40
|
+
|
|
41
|
+
class ContentExtractor:
    """Text extraction for HTML pages and binary documents (PDF, Word)."""

    @staticmethod
    def clean_text(html: str) -> str:
        """
        Extract main content text, removing boilerplates.

        Returns an empty string for empty input. Otherwise trafilatura is
        tried first; if it yields nothing, the text of the document minus
        script/style/nav/footer/header elements is used. Whitespace runs are
        collapsed to single spaces.
        """
        if not html:
            return ""

        # Use trafilatura for high-quality extraction
        content = trafilatura.extract(html, include_comments=False, include_tables=True, no_fallback=False)
        if not content:
            # Fallback to BeautifulSoup if trafilatura fails
            soup = BeautifulSoup(html, 'lxml')
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            content = soup.get_text(separator=' ')

        # Final cleanup
        return re.sub(r'\s+', ' ', content).strip()

    @staticmethod
    def extract_from_binary(content: bytes, content_type: str) -> str:
        """
        Extract text from non-HTML binary files (PDF, Docx).

        Args:
            content: Raw document bytes.
            content_type: MIME type used to select the parser; a missing value
                is treated as unsupported.

        Returns:
            The extracted text, an ``"Error extracting ..."`` message when the
            parser fails, or ``"Unsupported binary format"``.
        """
        import io

        kind = (content_type or "").lower()

        # Handle PDF
        if 'pdf' in kind:
            try:
                from pypdf import PdfReader
                reader = PdfReader(io.BytesIO(content))
                # ``or ""`` guards against a page yielding no text object.
                return "\n".join((page.extract_text() or "") for page in reader.pages).strip()
            except Exception as e:
                return f"Error extracting PDF: {e}"

        # Handle Word Documents
        if 'word' in kind or 'officedocument.wordprocessingml' in kind:
            try:
                from docx import Document
                doc = Document(io.BytesIO(content))
                return "\n".join(para.text for para in doc.paragraphs).strip()
            except Exception as e:
                return f"Error extracting Word doc: {e}"

        return "Unsupported binary format"
|
|
86
|
+
|
|
87
|
+
class SPADetector:
    """Heuristics for recognising client-side rendered (SPA) pages from raw HTML."""

    # Regex fragments whose presence in the markup marks a framework.
    FRAMEWORK_PATTERNS = {
        'react': [r'react-root', r'_reactRootContainer', r'data-reactid', r'data-reactroot'],
        'vue': [r'v-bind:', r'v-on:', r'__vue__', r'data-v-'],
        'angular': [r'ng-version', r'ng-app', r'ng-controller', r'ng-repeat'],
        'nextjs': [r'__NEXT_DATA__', r'_next/static'],
        'nuxt': [r'__NUXT__'],
    }

    @staticmethod
    def is_spa(html: str) -> bool:
        """
        Detect if the page is likely a Single Page Application.

        Returns True when any framework pattern matches (case-insensitively),
        or when the ``<body>`` contains a script tag and fewer than 200
        characters remain once script elements are removed.
        """
        if not html:
            return False

        if any(
            re.search(pattern, html, re.IGNORECASE)
            for patterns in SPADetector.FRAMEWORK_PATTERNS.values()
            for pattern in patterns
        ):
            return True

        # Check for empty body with lots of scripts
        body_match = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL | re.IGNORECASE)
        if not body_match:
            return False

        body_content = body_match.group(1)
        # If body is mostly empty but has many script tags
        clean_body = re.sub(r'<script[^>]*>.*?</script>', '', body_content, flags=re.DOTALL | re.IGNORECASE)
        # Match the tag case-insensitively, consistent with the removal regex
        # above; otherwise ``<SCRIPT>``-only bodies are never detected.
        has_script = re.search(r'<script', body_content, re.IGNORECASE) is not None
        return has_script and len(clean_body.strip()) < 200
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from typing import Tuple, Optional
|
|
4
|
+
import requests
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from .models import CrawlerConfig
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
class BaseFetcher(ABC):
    """Common interface implemented by every page fetcher."""

    # The return annotation spells out ``str | bytes | None`` instead of
    # ``Optional[Any]``: ``Any`` is not imported in this module, and an
    # undefined name in a signature raises NameError when the module loads.
    @abstractmethod
    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[str | bytes | None, int, Optional[str], Optional[str]]:
        """
        Fetch content from a URL.
        Returns: (content, status_code, title, content_type)
        """
        pass
|
|
18
|
+
|
|
19
|
+
class RequestsFetcher(BaseFetcher):
    """Fetcher backed by the blocking ``requests`` library (no JavaScript execution)."""

    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[str | bytes | None, int, Optional[str], Optional[str]]:
        """
        Fetch *url* with a plain HTTP GET.

        Returns:
            ``(content, status_code, title, content_type)``. ``content`` is the
            decoded text for ``html``/``text`` responses, raw bytes for any
            other 200 response, and ``None`` for non-200 responses. On any
            exception ``(None, 0, None, None)`` is returned.
        """
        import asyncio  # local import: this module has no top-level asyncio import

        def _blocking_fetch():
            headers = {"User-Agent": config.user_agent}
            # The context manager closes the streamed response on every path,
            # including non-200 responses whose body is never read.
            with requests.get(url, headers=headers, timeout=config.timeout_ms / 1000, stream=True) as response:
                content_type = response.headers.get('Content-Type', '').split(';')[0].lower()

                if response.status_code != 200:
                    return None, response.status_code, None, content_type

                # Handle text-based content
                if 'html' in content_type or 'text' in content_type:
                    body = response.text
                    soup = BeautifulSoup(body, 'lxml')
                    title = soup.title.string if soup.title else ""
                    return body, response.status_code, title, content_type

                # Handle binary content (PDF, Docx, etc)
                return response.content, response.status_code, url.split('/')[-1], content_type

        try:
            # ``requests`` is synchronous; running it in a worker thread keeps
            # the event loop free so concurrent fetches actually overlap.
            return await asyncio.to_thread(_blocking_fetch)
        except Exception as e:
            logger.error(f"RequestsFetcher error for {url}: {e}")
            return None, 0, None, None
|
|
40
|
+
|
|
41
|
+
class PlaywrightFetcher(BaseFetcher):
    """Fetcher that renders pages in a Chromium browser driven by Playwright."""

    def __init__(self):
        """Create the fetcher; the browser itself is launched on first use."""
        import asyncio  # local import: this module has no top-level asyncio import

        self.playwright = None
        self.browser = None
        # Serialises browser start-up so concurrent first fetches share one browser.
        self._launch_lock = asyncio.Lock()

    async def _ensure_browser(self, config: CrawlerConfig):
        """Start Playwright and launch Chromium if no browser is running yet."""
        if self.browser:
            return
        async with self._launch_lock:
            # Re-check under the lock: another task may have launched it.
            if self.browser:
                return
            from playwright.async_api import async_playwright
            if not self.playwright:
                self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(headless=config.headless)

    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[str | bytes | None, int, Optional[str], Optional[str]]:
        """
        Load *url* in a fresh browser context and return the rendered page.

        Returns:
            ``(content, status_code, title, content_type)``. ``content`` is the
            rendered HTML for html responses and the raw response body for any
            other content type. ``(None, 0, None, None)`` is returned when the
            navigation yields no response or any exception occurs.
        """
        import asyncio

        context = None
        try:
            await self._ensure_browser(config)
            context = await self.browser.new_context(user_agent=config.user_agent)
            page = await context.new_page()

            # Playwright is mainly for HTML/SPA, but it can handle navigation to documents
            response = await page.goto(url, wait_until="networkidle", timeout=config.timeout_ms)

            if not response:
                return None, 0, None, None

            content_type = response.headers.get('content-type', '').split(';')[0].lower()

            # If it's a binary file, Playwright might trigger a download or show a PDF viewer
            # For simplicity, we fallback to requests for binary files in the crawler engine,
            # but here we return what we can.
            if 'html' not in content_type:
                # Use raw response body for non-html
                content = await response.body()
                return content, response.status, url.split('/')[-1], content_type

            # Give client-side scripts time to render (js_wait_time is in ms).
            await asyncio.sleep(config.js_wait_time / 1000)

            if config.wait_for_selector:
                try:
                    await page.wait_for_selector(config.wait_for_selector, timeout=5000)
                except Exception as e:
                    # Best effort: a missing selector must not fail the fetch.
                    # ``Exception`` (not a bare except) lets task cancellation through.
                    logger.debug(f"wait_for_selector failed for {url}: {e}")

            html = await page.content()
            status = response.status
            title = await page.title()

            return html, status, title, content_type
        except Exception as e:
            logger.error(f"PlaywrightFetcher error for {url}: {e}")
            return None, 0, None, None
        finally:
            # Release the context on every path, including errors and early returns.
            if context is not None:
                try:
                    await context.close()
                except Exception as e:
                    logger.debug(f"Failed to close browser context for {url}: {e}")

    async def close(self):
        """Close the browser and stop Playwright; the fetcher can be used again afterwards."""
        browser, playwright = self.browser, self.playwright
        # Clear the handles first so a later fetch launches a fresh browser
        # instead of reusing a closed one.
        self.browser = None
        self.playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import List, Optional, Dict, Any
|
|
2
|
+
from pydantic import BaseModel, Field, HttpUrl
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
class CrawlerConfig(BaseModel):
    """Configuration for the Crawler engine."""
    # Entry point of the crawl (the only required field).
    start_url: str
    # Crawl limits; the Field constraints reject values below the given minimum.
    max_depth: int = Field(default=3, ge=0)
    max_pages: int = Field(default=100, ge=1)
    concurrency: int = Field(default=3, ge=1)
    # Milliseconds (minimum 1000); PlaywrightFetcher uses it as the page.goto timeout.
    timeout_ms: int = Field(default=30000, ge=1000)
    # Seconds to wait between requests, per the README.
    rate_limit_delay: float = Field(default=1.0, ge=0.0)

    # SPA Settings
    use_playwright: bool = True
    # Passed to chromium.launch(headless=...) by PlaywrightFetcher.
    headless: bool = True
    # Optional CSS selector PlaywrightFetcher waits for before reading the page.
    wait_for_selector: Optional[str] = None
    # Milliseconds PlaywrightFetcher sleeps after navigation before reading the page.
    js_wait_time: int = 2000

    # Filtering
    allowed_domains: List[str] = []
    # Regular expressions; presumably URLs matching any of them are skipped
    # (the matching code is not in this module - confirm against the crawler).
    ignore_patterns: List[str] = [
        r"\.(css|js|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot)$",
        r"^javascript:",
        r"^mailto:",
        r"^tel:",
    ]

    # Custom Headers
    user_agent: str = "SitewiseCrawler/0.1.0 (+https://github.com/tarxemo/sitewise-crawler)"
|
|
31
|
+
|
|
32
|
+
class PageData(BaseModel):
    """Data extracted from a single page."""
    # Required: url, content, depth and status_code have no defaults.
    url: str
    title: Optional[str] = None
    content: str
    html: Optional[str] = None
    depth: int
    status_code: int
    is_spa: bool = False
    metadata: Dict[str, Any] = {}
    links: List[str] = []
    # Set to datetime.now() (naive local time) when the instance is created.
    timestamp: datetime = Field(default_factory=datetime.now)
|
|
44
|
+
|
|
45
|
+
class CrawlResult(BaseModel):
    """Summary result of a crawl session."""
    # All fields are required; none has a default.
    success: bool
    pages_all: List[PageData]
    failed_urls: List[str]
    duration_seconds: float
    total_pages: int
|
|
52
|
+
|
|
53
|
+
class CategoryScore(BaseModel):
    """A category name paired with its score (used by UserInsight.top_categories)."""
    category: str
    score: float  # 0.0 to 1.0 (documented range; not enforced by validation)
|
|
56
|
+
|
|
57
|
+
class Entity(BaseModel):
    """A named entity with its type and frequency (used by UserInsight.top_entities)."""
    name: str
    type: str  # Person, Organization, Location, Product, Concept (free-form str; not validated)
    frequency: int
|
|
61
|
+
|
|
62
|
+
class UserInsight(BaseModel):
    """Advanced behavioral analysis based on content consumed. Highly detailed for institutional use."""
    # Required fields (no default): user_id, total_urls_analyzed,
    # overall_sentiment, average_sentiment_score, estimated_intent and
    # behavioral_summary. The ranges noted in the comments below are
    # documentation only; no Field constraint enforces them.
    user_id: str
    # Set to datetime.now() (naive local time) when the instance is created.
    analyzed_at: datetime = Field(default_factory=datetime.now)
    total_urls_analyzed: int

    # --- Core Content Analysis ---
    primary_interests: List[str] = []
    top_categories: List[CategoryScore] = []
    content_languages: List[str] = []
    content_complexity_score: float = 0.0  # 0 to 1 (Simple vs Academic/Professional)
    technical_proficiency_level: str = "Unknown"  # Beginner, Intermediate, Advanced, Expert

    # --- Sentiment & Psychological Indicators ---
    overall_sentiment: str  # Positive, Neutral, Negative, Mixed
    average_sentiment_score: float  # -1.0 to 1.0
    stress_or_anxiety_indicators: float = 0.0  # 0 to 1 (Useful for student/employee wellbeing)

    # --- Behavioral & Productivity Insights ---
    estimated_intent: str  # Research, Information, Entertainment, Transactional, Social
    productivity_rating: float = 0.0  # 0 to 1
    time_wasting_probability: float = 0.0  # 0 to 1 (High if consuming excessive social media/entertainment)
    focus_score: float = 0.0  # 0 to 1 (Is browsing highly concentrated or scattered?)

    # --- Academic & Career Alignment ---
    educational_alignment: float = 0.0  # 0 to 1
    academic_relevance_score: float = 0.0  # 0 to 1
    career_development_focus: float = 0.0  # 0 to 1

    # --- Commercial Intent ---
    shopping_intent_score: float = 0.0  # 0 to 1
    likely_in_market_for: List[str] = []

    # --- Extracted Entities ---
    top_entities: List[Entity] = []

    # --- Institutional Risk Assessment ---
    risk_assessment_summary: Optional[str] = None
    nsfw_or_inappropriate_probability: float = 0.0  # 0 to 1

    # --- AI Synthesized Summaries ---
    behavioral_summary: str
    actionable_recommendation: Optional[str] = None  # E.g., "User might need study resources for Python"

    # Presumably the unprocessed model output kept for debugging - confirm against the analyzer.
    raw_ai_response: Optional[Dict[str, Any]] = None
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitewise-crawler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A flexible and advanced web crawler for modern SPAs and traditional websites.
|
|
5
|
+
Author-email: TarXemo <info@tarxemo.com>
|
|
6
|
+
Project-URL: Homepage, https://github.com/tarxemo/sitewise-crawler
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/tarxemo/sitewise-crawler/issues
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: requests
|
|
15
|
+
Requires-Dist: beautifulsoup4
|
|
16
|
+
Requires-Dist: playwright
|
|
17
|
+
Requires-Dist: trafilatura
|
|
18
|
+
Requires-Dist: lxml
|
|
19
|
+
Requires-Dist: pydantic
|
|
20
|
+
Requires-Dist: aiohttp
|
|
21
|
+
Requires-Dist: tenacity
|
|
22
|
+
Requires-Dist: pypdf
|
|
23
|
+
Requires-Dist: python-docx
|
|
24
|
+
Requires-Dist: groq
|
|
25
|
+
|
|
26
|
+
# Sitewise Crawler 🕷️
|
|
27
|
+
|
|
28
|
+
An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
|
|
29
|
+
|
|
30
|
+
## ✨ Features
|
|
31
|
+
|
|
32
|
+
- š **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
|
|
33
|
+
- 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
|
|
34
|
+
- 🔗 **SPA Link Discovery**: Discovers links even in complex client-side routers.
|
|
35
|
+
- 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
|
|
36
|
+
- š **Pydantic Models**: Type-safe configuration and results.
|
|
37
|
+
|
|
38
|
+
## 📦 Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install sitewise-crawler
|
|
42
|
+
playwright install chromium
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## 🚀 Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import asyncio
|
|
49
|
+
from sitewise_crawler import SPACrawler, CrawlerConfig
|
|
50
|
+
|
|
51
|
+
async def main():
|
|
52
|
+
# 1. Configure the crawler
|
|
53
|
+
config = CrawlerConfig(
|
|
54
|
+
start_url="https://example.com",
|
|
55
|
+
max_depth=2,
|
|
56
|
+
max_pages=10,
|
|
57
|
+
use_playwright=True,
|
|
58
|
+
headless=True
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# 2. Initialize and run
|
|
62
|
+
crawler = SPACrawler(config)
|
|
63
|
+
|
|
64
|
+
# Optional: Add a callback for each page crawled
|
|
65
|
+
crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
|
|
66
|
+
|
|
67
|
+
result = await crawler.crawl()
|
|
68
|
+
|
|
69
|
+
# 3. Process results
|
|
70
|
+
if result.success:
|
|
71
|
+
print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
|
|
72
|
+
for page in result.pages_all:
|
|
73
|
+
print(f"- {page.url} ({len(page.content)} chars)")
|
|
74
|
+
|
|
75
|
+
if __name__ == "__main__":
|
|
76
|
+
asyncio.run(main())
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## ⚙️ Configuration Options
|
|
80
|
+
|
|
81
|
+
The `CrawlerConfig` class supports the following parameters:
|
|
82
|
+
|
|
83
|
+
| Parameter | Type | Default | Description |
|
|
84
|
+
|-----------|------|---------|-------------|
|
|
85
|
+
| `start_url` | `str` | *Required* | The entry point for the crawler. |
|
|
86
|
+
| `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
|
|
87
|
+
| `max_pages` | `int` | `100` | Stop crawling after this many pages. |
|
|
88
|
+
| `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
|
|
89
|
+
| `headless` | `bool` | `True` | Run browser in headless mode. |
|
|
90
|
+
| `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
|
|
91
|
+
| `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
|
|
92
|
+
|
|
93
|
+
## 🤝 Contributing
|
|
94
|
+
|
|
95
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
96
|
+
|
|
97
|
+
## š License
|
|
98
|
+
|
|
99
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
100
|
+
# sitewise_crawler
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/sitewise_crawler/__init__.py
|
|
4
|
+
src/sitewise_crawler/analyzer.py
|
|
5
|
+
src/sitewise_crawler/crawler.py
|
|
6
|
+
src/sitewise_crawler/extractors.py
|
|
7
|
+
src/sitewise_crawler/fetchers.py
|
|
8
|
+
src/sitewise_crawler/models.py
|
|
9
|
+
src/sitewise_crawler.egg-info/PKG-INFO
|
|
10
|
+
src/sitewise_crawler.egg-info/SOURCES.txt
|
|
11
|
+
src/sitewise_crawler.egg-info/dependency_links.txt
|
|
12
|
+
src/sitewise_crawler.egg-info/requires.txt
|
|
13
|
+
src/sitewise_crawler.egg-info/top_level.txt
|
|
14
|
+
tests/test_analyzer.py
|
|
15
|
+
tests/test_local.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sitewise_crawler
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
import json
|
|
5
|
+
|
|
6
|
+
# Add src to path for local testing
|
|
7
|
+
sys.path.append(os.path.join(os.getcwd(), 'src'))
|
|
8
|
+
|
|
9
|
+
from sitewise_crawler import InsightEngine, CrawlerConfig
|
|
10
|
+
|
|
11
|
+
async def test_analyzer():
    """Manually exercise InsightEngine against two public URLs and print the result.

    Requires the GROQ_API_KEY environment variable; when it is unset, empty,
    or still the placeholder value, a hint is printed and nothing is run.
    Performs real network requests, so this is a manual smoke test rather
    than an isolated unit test. Errors are reported on stdout, not raised.
    """
    placeholder = "your-groq-api-key"
    # Replace with your actual Groq API Key
    api_key = os.getenv("GROQ_API_KEY", placeholder)

    # An empty value is as unusable as the placeholder, so reject both.
    if not api_key or api_key == placeholder:
        print("⚠️ Please set your GROQ_API_KEY environment variable to test the AI analyzer.")
        return

    print("🚀 Starting InsightEngine test...")
    engine = InsightEngine(api_key=api_key)

    # We use some fast, public URLs to simulate a user's browsing history
    urls_to_analyze = [
        "https://en.wikipedia.org/wiki/Machine_learning",
        "https://www.python.org/",
    ]

    try:
        # We can pass a custom CrawlerConfig to speed up the test (e.g., disable playwright)
        config = CrawlerConfig(
            start_url=urls_to_analyze[0],
            max_pages=len(urls_to_analyze),
            use_playwright=False,
        )

        insight = await engine.analyze_user_behavior(
            user_id="test_user_001",
            urls=urls_to_analyze,
            crawler_config=config,
        )

        print("\n✅ Analysis Complete! Here is the data ready for your database:\n")

        # Convert to dictionary and print as formatted JSON
        insight_dict = insight.model_dump(mode='json')
        print(json.dumps(insight_dict, indent=2))

    except Exception as e:
        # Best-effort manual script: report the failure instead of a traceback.
        print(f"❌ Error during analysis: {e}")


if __name__ == "__main__":
    asyncio.run(test_analyzer())
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import sys
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
# Add src to path for local testing
|
|
6
|
+
sys.path.append(os.path.join(os.getcwd(), 'src'))
|
|
7
|
+
|
|
8
|
+
from sitewise_crawler import SPACrawler, CrawlerConfig
|
|
9
|
+
|
|
10
|
+
async def test_crawl():
    """Manually run a tiny requests-only crawl with SPACrawler and print a summary.

    Performs real network requests (max_depth=1, max_pages=2, Playwright
    disabled), so this is a manual smoke test rather than an isolated unit test.
    """
    print("🚀 Starting test crawl...")
    config = CrawlerConfig(
        start_url="https://www.google.com",  # Fast, traditional site
        max_depth=1,
        max_pages=2,
        use_playwright=False,  # Keep it fast for testing
    )

    crawler = SPACrawler(config)

    def on_page(page):
        # Progress callback invoked by the crawler for each crawled page.
        print(f"DEBUG: Processed {page.url}")

    crawler.on_page_crawled = on_page

    result = await crawler.crawl()

    print("\nSummary:")
    print(f"Success: {result.success}")
    print(f"Pages: {result.total_pages}")
    print(f"Duration: {result.duration_seconds:.2f}s")

    if result.success:
        for p in result.pages_all:
            print(f"- {p.url} ({p.title})")


if __name__ == "__main__":
    asyncio.run(test_crawl())
|