sitewise-crawler 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: sitewise-crawler
3
+ Version: 0.1.0
4
+ Summary: A flexible and advanced web crawler for modern SPAs and traditional websites.
5
+ Author-email: TarXemo <info@tarxemo.com>
6
+ Project-URL: Homepage, https://github.com/tarxemo/sitewise-crawler
7
+ Project-URL: Bug Tracker, https://github.com/tarxemo/sitewise-crawler/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: requests
15
+ Requires-Dist: beautifulsoup4
16
+ Requires-Dist: playwright
17
+ Requires-Dist: trafilatura
18
+ Requires-Dist: lxml
19
+ Requires-Dist: pydantic
20
+ Requires-Dist: aiohttp
21
+ Requires-Dist: tenacity
22
+ Requires-Dist: pypdf
23
+ Requires-Dist: python-docx
24
+ Requires-Dist: groq
25
+
26
+ # Sitewise Crawler 🕷️
27
+
28
+ An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
29
+
30
+ ## ✨ Features
31
+
32
+ - 🚀 **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
32
+ - 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
33
+ - 🔗 **SPA Link Discovery**: Discovers links even in complex client-side routers.
34
+ - 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
35
+ - 📝 **Pydantic Models**: Type-safe configuration and results.
37
+
38
+ ## 📦 Installation
39
+
40
+ ```bash
41
+ pip install sitewise-crawler
42
+ playwright install chromium
43
+ ```
44
+
45
+ ## 🚀 Quick Start
46
+
47
+ ```python
48
+ import asyncio
49
+ from sitewise_crawler import SPACrawler, CrawlerConfig
50
+
51
+ async def main():
52
+ # 1. Configure the crawler
53
+ config = CrawlerConfig(
54
+ start_url="https://example.com",
55
+ max_depth=2,
56
+ max_pages=10,
57
+ use_playwright=True,
58
+ headless=True
59
+ )
60
+
61
+ # 2. Initialize and run
62
+ crawler = SPACrawler(config)
63
+
64
+ # Optional: Add a callback for each page crawled
65
+ crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
66
+
67
+ result = await crawler.crawl()
68
+
69
+ # 3. Process results
70
+ if result.success:
71
+ print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
72
+ for page in result.pages_all:
73
+ print(f"- {page.url} ({len(page.content)} chars)")
74
+
75
+ if __name__ == "__main__":
76
+ asyncio.run(main())
77
+ ```
78
+
79
+ ## āš™ļø Configuration Options
80
+
81
+ The `CrawlerConfig` class supports the following parameters:
82
+
83
+ | Parameter | Type | Default | Description |
84
+ |-----------|------|---------|-------------|
85
+ | `start_url` | `str` | *Required* | The entry point for the crawler. |
86
+ | `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
87
+ | `max_pages` | `int` | `100` | Stop crawling after this many pages. |
88
+ | `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
89
+ | `headless` | `bool` | `True` | Run browser in headless mode. |
90
+ | `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
91
+ | `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
92
+
93
+ ## šŸ¤ Contributing
94
+
95
+ Contributions are welcome! Please feel free to submit a Pull Request.
96
+
97
+ ## 📄 License
98
+
99
+ This project is licensed under the MIT License - see the LICENSE file for details.
100
+ # sitewise_crawler
@@ -0,0 +1,75 @@
1
+ # Sitewise Crawler 🕷️
2
+
3
+ An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
4
+
5
+ ## ✨ Features
6
+
7
+ - 🚀 **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
8
+ - 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
9
+ - 🔗 **SPA Link Discovery**: Discovers links even in complex client-side routers.
10
+ - 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
11
+ - 📝 **Pydantic Models**: Type-safe configuration and results.
12
+
13
+ ## 📦 Installation
14
+
15
+ ```bash
16
+ pip install sitewise-crawler
17
+ playwright install chromium
18
+ ```
19
+
20
+ ## 🚀 Quick Start
21
+
22
+ ```python
23
+ import asyncio
24
+ from sitewise_crawler import SPACrawler, CrawlerConfig
25
+
26
+ async def main():
27
+ # 1. Configure the crawler
28
+ config = CrawlerConfig(
29
+ start_url="https://example.com",
30
+ max_depth=2,
31
+ max_pages=10,
32
+ use_playwright=True,
33
+ headless=True
34
+ )
35
+
36
+ # 2. Initialize and run
37
+ crawler = SPACrawler(config)
38
+
39
+ # Optional: Add a callback for each page crawled
40
+ crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
41
+
42
+ result = await crawler.crawl()
43
+
44
+ # 3. Process results
45
+ if result.success:
46
+ print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
47
+ for page in result.pages_all:
48
+ print(f"- {page.url} ({len(page.content)} chars)")
49
+
50
+ if __name__ == "__main__":
51
+ asyncio.run(main())
52
+ ```
53
+
54
+ ## āš™ļø Configuration Options
55
+
56
+ The `CrawlerConfig` class supports the following parameters:
57
+
58
+ | Parameter | Type | Default | Description |
59
+ |-----------|------|---------|-------------|
60
+ | `start_url` | `str` | *Required* | The entry point for the crawler. |
61
+ | `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
62
+ | `max_pages` | `int` | `100` | Stop crawling after this many pages. |
63
+ | `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
64
+ | `headless` | `bool` | `True` | Run browser in headless mode. |
65
+ | `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
66
+ | `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
67
+
68
+ ## šŸ¤ Contributing
69
+
70
+ Contributions are welcome! Please feel free to submit a Pull Request.
71
+
72
+ ## 📄 License
73
+
74
+ This project is licensed under the MIT License - see the LICENSE file for details.
75
+ # sitewise_crawler
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sitewise-crawler"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name="TarXemo", email="info@tarxemo.com" },
10
+ ]
11
+ description = "A flexible and advanced web crawler for modern SPAs and traditional websites."
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
19
+ ]
20
+ dependencies = [
21
+ "requests",
22
+ "beautifulsoup4",
23
+ "playwright",
24
+ "trafilatura",
25
+ "lxml",
26
+ "pydantic",
27
+ "aiohttp",
28
+ "tenacity",
29
+ "pypdf",
30
+ "python-docx",
31
+ "groq",
32
+ ]
33
+
34
+ [project.urls]
35
+ "Homepage" = "https://github.com/tarxemo/sitewise-crawler"
36
+ "Bug Tracker" = "https://github.com/tarxemo/sitewise-crawler/issues"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
+ from .crawler import SPACrawler
2
+ from .models import CrawlerConfig, PageData, CrawlResult, UserInsight, CategoryScore
3
+ from .fetchers import RequestsFetcher, PlaywrightFetcher
4
+ from .extractors import LinkExtractor, ContentExtractor, SPADetector
5
+ from .analyzer import InsightEngine
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = [
9
+ "SPACrawler",
10
+ "CrawlerConfig",
11
+ "PageData",
12
+ "CrawlResult",
13
+ "UserInsight",
14
+ "CategoryScore",
15
+ "RequestsFetcher",
16
+ "PlaywrightFetcher",
17
+ "LinkExtractor",
18
+ "ContentExtractor",
19
+ "SPADetector",
20
+ "InsightEngine",
21
+ ]
@@ -0,0 +1,162 @@
1
+ import logging
2
+ import json
3
+ import asyncio
4
+ from typing import List, Optional, Dict, Any
5
+ from datetime import datetime
6
+ from groq import Groq
7
+ from .crawler import SPACrawler
8
+ from .models import CrawlerConfig, UserInsight, CategoryScore, PageData
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class InsightEngine:
    """
    Analyze the content behind a list of visited URLs with an LLM (Groq).

    Each URL is scraped with ``SPACrawler.scrape_page``; a truncated preview
    of every page is sent to the Groq chat-completions API and the returned
    JSON object is mapped onto a ``UserInsight`` model.
    """

    # Number of leading characters of each page that are forwarded to the LLM,
    # to stay within the model's context limit.
    _PREVIEW_CHARS = 1500

    def __init__(self, api_key: str, model: str = "llama-3.1-70b-versatile"):
        """
        Args:
            api_key: Groq API key used to build the client.
            model: Name of the Groq chat model to query.
        """
        self.client = Groq(api_key=api_key)
        self.model = model

    async def analyze_user_behavior(
        self,
        user_id: str,
        urls: List[str],
        crawler_config: Optional[CrawlerConfig] = None
    ) -> UserInsight:
        """
        Scrape ``urls`` and run the AI analysis on the extracted content.

        Args:
            user_id: Identifier copied into the resulting ``UserInsight``.
            urls: Pages to scrape; must contain at least one URL.
            crawler_config: Optional crawler settings. When omitted, a default
                config is built from the first URL.

        Returns:
            A ``UserInsight`` populated from the AI response; fields missing
            from the response fall back to neutral defaults.

        Raises:
            ValueError: If ``urls`` is empty or no page could be scraped.
        """
        # Fail with a clear message instead of an IndexError on urls[0].
        if not urls:
            raise ValueError("At least one URL is required for behavioral analysis.")

        logger.info("Starting behavioral analysis for user %s with %d URLs", user_id, len(urls))

        # 1. Scrape all URLs
        if not crawler_config:
            crawler_config = CrawlerConfig(start_url=urls[0], max_pages=len(urls))

        crawler = SPACrawler(crawler_config)
        try:
            # return_exceptions=True: one broken page must not abort the batch.
            outcomes = await asyncio.gather(
                *(crawler.scrape_page(url) for url in urls),
                return_exceptions=True,
            )
        finally:
            # scrape_page may have launched a browser; always release it.
            await crawler.playwright_fetcher.close()

        pages: List[PageData] = []
        for url, outcome in zip(urls, outcomes):
            if isinstance(outcome, BaseException):
                logger.warning("Scraping failed for %s: %s", url, outcome)
            elif outcome is not None:
                pages.append(outcome)

        if not pages:
            raise ValueError("No content could be extracted from the provided URLs.")

        # 2. Consolidate content for analysis
        # Only a preview of each page is sent to stay within LLM context limits.
        consolidated_content = "".join(
            f"\n--- URL: {page.url} ---\nTitle: {page.title}\n"
            f"Content: {page.content[:self._PREVIEW_CHARS]}\n"
            for page in pages
        )

        # 3. Perform AI Analysis
        analysis_result = await self._call_ai_analyzer(consolidated_content)

        # 4. Construct UserInsight model
        return UserInsight(
            user_id=user_id,
            total_urls_analyzed=len(pages),
            # Core
            primary_interests=analysis_result.get("primary_interests", []),
            top_categories=self._parse_categories(analysis_result.get("top_categories")),
            content_languages=analysis_result.get("content_languages", []),
            content_complexity_score=analysis_result.get("content_complexity_score", 0.0),
            technical_proficiency_level=analysis_result.get("technical_proficiency_level", "Unknown"),

            # Sentiment
            overall_sentiment=analysis_result.get("overall_sentiment", "Neutral"),
            average_sentiment_score=analysis_result.get("average_sentiment_score", 0.0),
            stress_or_anxiety_indicators=analysis_result.get("stress_or_anxiety_indicators", 0.0),

            # Behavior
            estimated_intent=analysis_result.get("estimated_intent", "Unknown"),
            productivity_rating=analysis_result.get("productivity_rating", 0.0),
            time_wasting_probability=analysis_result.get("time_wasting_probability", 0.0),
            focus_score=analysis_result.get("focus_score", 0.0),

            # Academic/Career
            educational_alignment=analysis_result.get("educational_alignment", 0.0),
            academic_relevance_score=analysis_result.get("academic_relevance_score", 0.0),
            career_development_focus=analysis_result.get("career_development_focus", 0.0),

            # Commercial
            shopping_intent_score=analysis_result.get("shopping_intent_score", 0.0),
            likely_in_market_for=analysis_result.get("likely_in_market_for", []),

            # Entities
            top_entities=self._parse_entities(analysis_result.get("top_entities")),

            # Risk
            risk_assessment_summary=analysis_result.get("risk_assessment_summary"),
            nsfw_or_inappropriate_probability=analysis_result.get("nsfw_or_inappropriate_probability", 0.0),

            # Summary
            behavioral_summary=analysis_result.get("behavioral_summary", "No summary available."),
            actionable_recommendation=analysis_result.get("actionable_recommendation"),
            raw_ai_response=analysis_result
        )

    @staticmethod
    def _parse_categories(raw: Any) -> List[CategoryScore]:
        """Convert the AI ``top_categories`` list, skipping malformed entries."""
        categories: List[CategoryScore] = []
        for item in raw or []:
            try:
                categories.append(CategoryScore(category=item["name"], score=item["score"]))
            except (KeyError, TypeError, ValueError) as exc:
                logger.warning("Skipping malformed category %r: %s", item, exc)
        return categories

    @staticmethod
    def _parse_entities(raw: Any) -> list:
        """Convert the AI ``top_entities`` list, skipping malformed entries."""
        # Entity is not part of this module's top-level imports.
        from .models import Entity

        entities = []
        for item in raw or []:
            try:
                entities.append(Entity(**item))
            except (TypeError, ValueError) as exc:
                logger.warning("Skipping malformed entity %r: %s", item, exc)
        return entities

    async def _call_ai_analyzer(self, content: str) -> Dict[str, Any]:
        """
        Ask Groq for a JSON behavioral profile of ``content``.

        Returns the parsed JSON object. On any failure the error is logged and
        a minimal dict carrying the failure message is returned instead, so the
        caller can still build a ``UserInsight``.
        """
        prompt = f"""
You are a highly advanced Behavioral Profiling AI working for an institutional analytics platform.
Analyze the following browsing history content and provide a massive, highly detailed behavioral profile of the user.
Return ONLY a JSON object exactly matching this structure (fill in the values based on your analysis):
{{
    "primary_interests": ["topic1", "topic2", "topic3"],
    "top_categories": [{{"name": "CategoryName", "score": 0.9}}],
    "content_languages": ["English"],
    "content_complexity_score": 0.8,
    "technical_proficiency_level": "Intermediate",
    "overall_sentiment": "Positive",
    "average_sentiment_score": 0.5,
    "stress_or_anxiety_indicators": 0.1,
    "estimated_intent": "Research",
    "productivity_rating": 0.85,
    "time_wasting_probability": 0.15,
    "focus_score": 0.9,
    "educational_alignment": 0.9,
    "academic_relevance_score": 0.8,
    "career_development_focus": 0.7,
    "shopping_intent_score": 0.0,
    "likely_in_market_for": [],
    "top_entities": [{{"name": "OpenAI", "type": "Organization", "frequency": 5}}],
    "risk_assessment_summary": "Low risk. Professional content.",
    "nsfw_or_inappropriate_probability": 0.0,
    "behavioral_summary": "A comprehensive paragraph summarizing habits.",
    "actionable_recommendation": "Suggest resource X based on interest Y."
}}

Content to analyze:
{content}
"""

        try:
            # The Groq client call is synchronous; run it in a worker thread so
            # the event loop is not blocked while waiting for the response.
            chat_completion = await asyncio.to_thread(
                self.client.chat.completions.create,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a specialized User Behavior Analyst. You extract deep, multi-dimensional insights from web content history. Output strictly valid JSON."
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model=self.model,
                response_format={"type": "json_object"}
            )

            parsed = json.loads(chat_completion.choices[0].message.content)
            # The caller uses .get(); anything but a JSON object is unusable.
            if not isinstance(parsed, dict):
                raise ValueError("AI response is not a JSON object")
            return parsed
        except Exception as e:
            logger.error(f"AI Analysis failed: {e}")
            return {
                "behavioral_summary": f"Failed to perform AI analysis: {str(e)}",
                "overall_sentiment": "Unknown"
            }
@@ -0,0 +1,132 @@
1
+ import asyncio
2
+ import logging
3
+ import time
4
+ from collections import deque
5
+ from typing import Set, List, Optional, Callable
6
+ from .models import CrawlerConfig, PageData, CrawlResult
7
+ from .fetchers import RequestsFetcher, PlaywrightFetcher
8
+ from .extractors import LinkExtractor, ContentExtractor, SPADetector
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class SPACrawler:
    """
    Breadth-first crawler that handles both SPAs and traditional websites.

    Pages are fetched with ``RequestsFetcher`` first; HTML that looks like a
    single-page application is re-fetched with ``PlaywrightFetcher`` when
    ``config.use_playwright`` is enabled.
    """

    def __init__(self, config: CrawlerConfig):
        self.config = config
        self.visited: Set[str] = set()
        # Queue of (url, depth) pairs, seeded with the start URL at depth 0.
        self.queue = deque([(config.start_url, 0)])
        self.results: List[PageData] = []
        self.failed_urls: List[str] = []

        self.requests_fetcher = RequestsFetcher()
        self.playwright_fetcher = PlaywrightFetcher()

        # User-defined hook for page processing
        self.on_page_crawled: Optional[Callable[[PageData], None]] = None

    def _is_ignored(self, url: str) -> bool:
        """Return True when ``url`` matches one of ``config.ignore_patterns``."""
        import re  # local import: `re` is not among this module's imports

        return any(re.search(pattern, url) for pattern in self.config.ignore_patterns)

    def _is_allowed_domain(self, url: str) -> bool:
        """
        Return True when ``url`` is on the start URL's host or on a host
        listed in ``config.allowed_domains`` (bare hosts or full URLs).
        """
        from urllib.parse import urlparse  # local import, see _is_ignored

        if LinkExtractor.is_same_domain(url, self.config.start_url):
            return True
        host = urlparse(url).netloc
        if not host:
            # e.g. mailto:/javascript: links have no host at all
            return False
        return any(
            host == domain or host == urlparse(domain).netloc
            for domain in self.config.allowed_domains
        )

    def _should_crawl(self, url: str, depth: int) -> bool:
        """Decide whether ``url`` (found at ``depth``) should be fetched."""
        if url in self.visited:
            return False
        if depth > self.config.max_depth:
            return False
        if len(self.results) >= self.config.max_pages:
            return False
        if self._is_ignored(url):
            return False
        if self.config.allowed_domains and not self._is_allowed_domain(url):
            return False
        return True

    async def crawl(self) -> CrawlResult:
        """
        Run the BFS crawl described by the config.

        Returns:
            A ``CrawlResult`` with every scraped page, the URLs that failed,
            and the elapsed time. ``success`` is True when at least one page
            was scraped.
        """
        start_time = time.time()
        logger.info(f"Starting crawl for {self.config.start_url}")

        try:
            while self.queue and len(self.results) < self.config.max_pages:
                url, depth = self.queue.popleft()

                if not self._should_crawl(url, depth):
                    continue

                self.visited.add(url)

                # Step 1: Extract the page
                page_data = await self.scrape_page(url, depth=depth)

                if not page_data:
                    self.failed_urls.append(url)
                    continue

                self.results.append(page_data)

                # Callback
                if self.on_page_crawled:
                    self.on_page_crawled(page_data)

                # Step 2: Add new links to queue (only if we haven't hit max depth)
                if depth < self.config.max_depth:
                    for link in page_data.links:
                        # Skip already-fetched links so the queue stays small.
                        if link not in self.visited and self._is_allowed_domain(link):
                            self.queue.append((link, depth + 1))

                # Rate limiting
                await asyncio.sleep(self.config.rate_limit_delay)
        finally:
            # Release the browser even if a fetch or the user callback raised.
            await self.playwright_fetcher.close()

        duration = time.time() - start_time
        return CrawlResult(
            success=len(self.results) > 0,
            pages_all=self.results,
            failed_urls=self.failed_urls,
            duration_seconds=duration,
            total_pages=len(self.results)
        )

    async def scrape_page(self, url: str, depth: int = 0) -> Optional[PageData]:
        """
        Extract data from a single URL.

        Supports HTML, SPAs, plain-text responses, PDFs, and Word documents.

        Args:
            url: Page to fetch.
            depth: Crawl depth recorded on the returned ``PageData``.

        Returns:
            The extracted ``PageData``, or None when nothing could be fetched.
        """
        logger.info(f"Scraping page: {url}")

        # Step 1: Fetch content
        # We start with Requests for efficiency and document handling
        content, status, title, content_type = await self.requests_fetcher.fetch(url, self.config)

        if not content:
            return None

        content_type = content_type or ''
        is_spa = False
        is_binary = False

        # Step 2: Handle based on content type
        if 'html' in content_type:
            # Check for SPA
            if self.config.use_playwright and SPADetector.is_spa(content):
                logger.info(f"SPA detected for {url}, switching to Playwright")
                is_spa = True
                rendered, r_status, r_title, r_type = await self.playwright_fetcher.fetch(url, self.config)
                if rendered and isinstance(rendered, str):
                    content, status, title = rendered, r_status, r_title
                    content_type = r_type or content_type
                else:
                    # PlaywrightFetcher returns None content on failure; keep
                    # the static HTML instead of crashing in the extractors.
                    logger.warning("Playwright rendering failed for %s, using static HTML", url)

            # Extract HTML content
            text_content = ContentExtractor.clean_text(content)
            links = LinkExtractor.extract_links(content, url)
        elif isinstance(content, str):
            # RequestsFetcher already decoded this text/* response; it is not
            # a binary document, so use the text as-is.
            text_content = content
            links = []
        else:
            # Handle binary documents
            logger.info(f"Binary document detected ({content_type}) for {url}")
            is_binary = True
            text_content = ContentExtractor.extract_from_binary(content, content_type)
            links = []  # Binary files usually don't have crawlable links for our BFS

        # Raw HTML is only retained for small crawls (max_pages < 10).
        keep_html = isinstance(content, str) and self.config.max_pages < 10

        return PageData(
            url=url,
            title=title or url.split('/')[-1],
            content=text_content,
            html=content if keep_html else None,
            depth=depth,
            status_code=status,
            is_spa=is_spa,
            links=links,
            metadata={'content_type': content_type, 'is_binary': is_binary}
        )
@@ -0,0 +1,113 @@
1
+ import re
2
+ from urllib.parse import urljoin, urlparse, urlunparse
3
+ from typing import List, Set
4
+ from bs4 import BeautifulSoup
5
+ import trafilatura
6
+
7
class LinkExtractor:
    """Helpers for resolving, comparing and collecting hyperlinks."""

    @staticmethod
    def normalize_url(url: str, base_url: str) -> str:
        """
        Resolve ``url`` against ``base_url`` and strip its fragment.

        Relative paths and protocol-relative URLs (``//host/path``) are both
        resolved against ``base_url``; absolute URLs are kept. Trailing
        slashes are removed from the result.
        """
        # urljoin leaves absolute URLs alone and, unlike a netloc-only check,
        # also gives protocol-relative URLs the base URL's scheme.
        parsed = urlparse(urljoin(base_url, url))

        # Remove fragments and normalize
        normalized = urlunparse(parsed._replace(fragment=''))
        return normalized.rstrip('/')

    @staticmethod
    def is_same_domain(url: str, base_url: str) -> bool:
        """Return True when both URLs have exactly the same network location."""
        return urlparse(url).netloc == urlparse(base_url).netloc

    @staticmethod
    def extract_links(html: str, base_url: str) -> List[str]:
        """
        Return the unique, normalized targets of every ``<a href>`` in ``html``.

        Links that cannot be parsed are skipped. The result is sorted so that
        callers see a deterministic order.
        """
        soup = BeautifulSoup(html, 'lxml')
        hrefs = {a['href'] for a in soup.find_all('a', href=True)}

        normalized = set()
        for href in hrefs:
            try:
                normalized.add(LinkExtractor.normalize_url(href, base_url))
            except ValueError:
                # urlparse/urljoin raise ValueError on malformed URLs
                # (e.g. an invalid IPv6 literal); skip just that link.
                continue
        return sorted(normalized)
40
+
41
class ContentExtractor:
    """Text extraction for HTML pages and binary documents."""

    @staticmethod
    def clean_text(html: str) -> str:
        """Return the main text of ``html`` with whitespace collapsed."""
        # Primary path: trafilatura's main-content extraction.
        extracted = trafilatura.extract(
            html,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        )
        if not extracted:
            # trafilatura produced nothing: drop obvious boilerplate tags and
            # take all remaining text via BeautifulSoup.
            soup = BeautifulSoup(html, 'lxml')
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            extracted = soup.get_text(separator=' ')

        # Collapse every whitespace run into a single space.
        return re.sub(r'\s+', ' ', extracted).strip()

    @staticmethod
    def extract_from_binary(content: bytes, content_type: str) -> str:
        """
        Return the text of a PDF or Word document held in ``content``.

        Extraction errors are reported in the returned string rather than
        raised; any other content type yields "Unsupported binary format".
        """
        import io

        is_pdf = 'pdf' in content_type
        is_word = 'word' in content_type or 'officedocument.wordprocessingml' in content_type

        if is_pdf:
            try:
                from pypdf import PdfReader
                pdf = PdfReader(io.BytesIO(content))
                # One newline-terminated chunk per PDF page.
                pieces = "".join(page.extract_text() + "\n" for page in pdf.pages)
                return pieces.strip()
            except Exception as e:
                return f"Error extracting PDF: {e}"

        if is_word:
            try:
                from docx import Document
                document = Document(io.BytesIO(content))
                joined = "\n".join(paragraph.text for paragraph in document.paragraphs)
                return joined.strip()
            except Exception as e:
                return f"Error extracting Word doc: {e}"

        return "Unsupported binary format"
86
+
87
class SPADetector:
    """Heuristics for spotting pages rendered client-side by a JS framework."""

    # Regex markers, grouped by framework, whose presence in the HTML is
    # treated as evidence of a single-page application.
    FRAMEWORK_PATTERNS = {
        'react': [r'react-root', r'_reactRootContainer', r'data-reactid', r'data-reactroot'],
        'vue': [r'v-bind:', r'v-on:', r'__vue__', r'data-v-'],
        'angular': [r'ng-version', r'ng-app', r'ng-controller', r'ng-repeat'],
        'nextjs': [r'__NEXT_DATA__', r'_next/static'],
        'nuxt': [r'__NUXT__'],
    }

    @staticmethod
    def is_spa(html: str) -> bool:
        """Return True when ``html`` looks like a Single Page Application."""
        # 1) Any framework marker anywhere in the document (case-insensitive).
        marker_found = any(
            re.search(marker, html, re.IGNORECASE)
            for markers in SPADetector.FRAMEWORK_PATTERNS.values()
            for marker in markers
        )
        if marker_found:
            return True

        # 2) A <body> that holds scripts but fewer than 200 characters of
        #    anything else once the <script> elements are removed.
        body = re.search(r'<body[^>]*>(.*?)</body>', html, re.DOTALL | re.IGNORECASE)
        if body is None:
            return False

        inner = body.group(1)
        without_scripts = re.sub(
            r'<script[^>]*>.*?</script>', '', inner, flags=re.DOTALL | re.IGNORECASE
        )
        return '<script' in inner and len(without_scripts.strip()) < 200
@@ -0,0 +1,98 @@
1
import asyncio
import logging
from abc import ABC, abstractmethod
from typing import Any, Optional, Tuple

import requests
from bs4 import BeautifulSoup

from .models import CrawlerConfig
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class BaseFetcher(ABC):
    """Abstract interface shared by all page fetchers."""

    @abstractmethod
    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[Optional[Any], int, Optional[str], Optional[str]]:
        """
        Fetch content from a URL.

        Args:
            url: Address to retrieve.
            config: Crawler settings the implementation may consult.

        Returns:
            A 4-tuple ``(content, status_code, title, content_type)``.
        """
        ...
18
+
19
class RequestsFetcher(BaseFetcher):
    """Fetcher based on ``requests``; handles static HTML, text and documents."""

    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[Optional[Any], int, Optional[str], Optional[str]]:
        """
        Fetch ``url`` without blocking the event loop.

        Returns:
            ``(content, status_code, title, content_type)``. ``content`` is a
            ``str`` for text/HTML responses, ``bytes`` for anything else, and
            ``None`` for non-200 responses. On any error the tuple is
            ``(None, 0, None, None)``.
        """
        try:
            # requests is synchronous; run it in a worker thread so other
            # coroutines keep making progress while the download runs.
            return await asyncio.to_thread(self._fetch_sync, url, config)
        except Exception as e:
            logger.error(f"RequestsFetcher error for {url}: {e}")
            return None, 0, None, None

    @staticmethod
    def _fetch_sync(url: str, config: CrawlerConfig) -> Tuple[Optional[Any], int, Optional[str], Optional[str]]:
        """Blocking implementation of ``fetch``; exceptions propagate to the caller."""
        headers = {"User-Agent": config.user_agent}
        # stream=True defers the body download; the `with` block guarantees
        # the connection is released even when the body is never read.
        with requests.get(url, headers=headers, timeout=config.timeout_ms / 1000, stream=True) as response:
            content_type = response.headers.get('Content-Type', '').split(';')[0].lower()

            if response.status_code != 200:
                return None, response.status_code, None, content_type

            # Handle text-based content
            if 'html' in content_type or 'text' in content_type:
                text = response.text
                soup = BeautifulSoup(text, 'lxml')
                # .string is None when <title> contains nested tags.
                title = (soup.title.string or "") if soup.title else ""
                return text, response.status_code, title, content_type

            # Handle binary content (PDF, Docx, etc)
            return response.content, response.status_code, url.split('/')[-1], content_type
40
+
41
class PlaywrightFetcher(BaseFetcher):
    """Fetcher that renders pages in headless Chromium via Playwright."""

    def __init__(self):
        self.playwright = None
        self.browser = None
        # Serializes the first launch when several fetches start concurrently.
        self._launch_lock = asyncio.Lock()

    async def _ensure_browser(self, config: CrawlerConfig):
        """Start Playwright and launch Chromium on first use."""
        async with self._launch_lock:
            if not self.browser:
                # Imported lazily so Playwright is only needed when used.
                from playwright.async_api import async_playwright
                self.playwright = await async_playwright().start()
                self.browser = await self.playwright.chromium.launch(headless=config.headless)

    async def fetch(self, url: str, config: CrawlerConfig) -> Tuple[Optional[Any], int, Optional[str], Optional[str]]:
        """
        Load ``url`` in a fresh browser context and return the rendered page.

        Returns:
            ``(content, status_code, title, content_type)``. ``content`` is
            the rendered HTML for HTML responses and the raw response body
            (``bytes``) otherwise. On any error, or when navigation yields no
            response, the tuple is ``(None, 0, None, None)``.
        """
        context = None
        try:
            await self._ensure_browser(config)
            context = await self.browser.new_context(user_agent=config.user_agent)
            page = await context.new_page()

            # Playwright is mainly for HTML/SPA, but it can handle navigation to documents
            response = await page.goto(url, wait_until="networkidle", timeout=config.timeout_ms)

            if not response:
                return None, 0, None, None

            content_type = response.headers.get('content-type', '').split(';')[0].lower()

            # Non-HTML responses: hand back the raw body and use the last URL
            # segment as a title.
            if 'html' not in content_type:
                content = await response.body()
                return content, response.status, url.split('/')[-1], content_type

            # Give client-side code extra time to render (js_wait_time is in ms).
            await asyncio.sleep(config.js_wait_time / 1000)

            if config.wait_for_selector:
                try:
                    await page.wait_for_selector(config.wait_for_selector, timeout=5000)
                except Exception as e:
                    # Best effort: extract whatever has rendered so far.
                    logger.debug("Selector %r not found on %s: %s", config.wait_for_selector, url, e)

            html = await page.content()
            status = response.status
            title = await page.title()

            return html, status, title, content_type
        except Exception as e:
            logger.error(f"PlaywrightFetcher error for {url}: {e}")
            return None, 0, None, None
        finally:
            # Close the context on every path (early return, timeout, error)
            # so contexts do not pile up inside the browser.
            if context is not None:
                try:
                    await context.close()
                except Exception as e:
                    logger.debug("Failed to close browser context for %s: %s", url, e)

    async def close(self):
        """Shut down the browser and Playwright; safe to call repeatedly."""
        # Clear the handles first so a later fetch() launches a fresh browser
        # instead of reusing a closed one.
        browser, self.browser = self.browser, None
        playwright, self.playwright = self.playwright, None
        if browser:
            await browser.close()
        if playwright:
            await playwright.stop()
@@ -0,0 +1,106 @@
1
+ from typing import List, Optional, Dict, Any
2
+ from pydantic import BaseModel, Field, HttpUrl
3
+ from datetime import datetime
4
+
5
class CrawlerConfig(BaseModel):
    """Configuration for the Crawler engine."""
    # Entry point of the crawl.
    start_url: str
    # Maximum link depth from start_url (validated >= 0).
    max_depth: int = Field(default=3, ge=0)
    # Upper bound on the number of pages collected (validated >= 1).
    max_pages: int = Field(default=100, ge=1)
    # Validated >= 1.
    concurrency: int = Field(default=3, ge=1)
    # Per-request timeout in milliseconds (validated >= 1000).
    timeout_ms: int = Field(default=30000, ge=1000)
    # Delay in seconds between pages (validated >= 0).
    rate_limit_delay: float = Field(default=1.0, ge=0.0)

    # SPA Settings
    # Enable the Playwright re-fetch for pages detected as SPAs.
    use_playwright: bool = True
    # Launch the browser in headless mode.
    headless: bool = True
    # Optional CSS selector to wait for before reading rendered HTML.
    wait_for_selector: Optional[str] = None
    # Extra wait after navigation, in milliseconds.
    js_wait_time: int = 2000

    # Filtering
    allowed_domains: List[str] = []
    # Regular expressions describing URLs to skip (static assets and
    # non-HTTP link schemes).
    ignore_patterns: List[str] = [
        r"\.(css|js|png|jpg|jpeg|gif|svg|ico|woff|woff2|ttf|eot)$",
        r"^javascript:",
        r"^mailto:",
        r"^tel:",
    ]

    # Custom Headers
    # Value sent as the User-Agent by the fetchers.
    user_agent: str = "SitewiseCrawler/0.1.0 (+https://github.com/tarxemo/sitewise-crawler)"
31
+
32
class PageData(BaseModel):
    """Data extracted from a single page."""
    # Address the page was fetched from.
    url: str
    title: Optional[str] = None
    # Extracted text content of the page or document.
    content: str
    # Raw HTML, when the crawler chose to keep it.
    html: Optional[str] = None
    # Crawl depth at which the page was reached.
    depth: int
    # HTTP status code reported by the fetcher.
    status_code: int
    is_spa: bool = False
    # Free-form extra information about the page.
    metadata: Dict[str, Any] = {}
    # Links discovered on the page.
    links: List[str] = []
    # Creation time of this record (naive local time from datetime.now).
    timestamp: datetime = Field(default_factory=datetime.now)
44
+
45
class CrawlResult(BaseModel):
    """Summary result of a crawl session."""
    # Overall outcome flag set by the crawler.
    success: bool
    # Every page that was scraped successfully.
    pages_all: List[PageData]
    # URLs for which scraping returned nothing.
    failed_urls: List[str]
    # Wall-clock duration of the crawl in seconds.
    duration_seconds: float
    # Number of pages collected.
    total_pages: int
52
+
53
class CategoryScore(BaseModel):
    """A content category paired with its score."""
    category: str
    # NOTE: the 0.0-1.0 range is a convention only; it is not validated here.
    score: float  # 0.0 to 1.0
56
+
57
class Entity(BaseModel):
    """A named entity together with its type and frequency."""
    name: str
    # Free-form string; the listed values are not enforced by validation.
    type: str  # Person, Organization, Location, Product, Concept
    frequency: int
61
+
62
class UserInsight(BaseModel):
    """Advanced behavioral analysis based on content consumed. Highly detailed for institutional use."""
    # NOTE: the numeric ranges and allowed values named in the trailing
    # comments below are conventions only; no field validators enforce them.
    user_id: str
    # Creation time of this record (naive local time from datetime.now).
    analyzed_at: datetime = Field(default_factory=datetime.now)
    total_urls_analyzed: int

    # --- Core Content Analysis ---
    primary_interests: List[str] = []
    top_categories: List[CategoryScore] = []
    content_languages: List[str] = []
    content_complexity_score: float = 0.0  # 0 to 1 (Simple vs Academic/Professional)
    technical_proficiency_level: str = "Unknown"  # Beginner, Intermediate, Advanced, Expert

    # --- Sentiment & Psychological Indicators ---
    # Required fields (no default): overall_sentiment, average_sentiment_score.
    overall_sentiment: str  # Positive, Neutral, Negative, Mixed
    average_sentiment_score: float  # -1.0 to 1.0
    stress_or_anxiety_indicators: float = 0.0  # 0 to 1 (Useful for student/employee wellbeing)

    # --- Behavioral & Productivity Insights ---
    # Required field (no default): estimated_intent.
    estimated_intent: str  # Research, Information, Entertainment, Transactional, Social
    productivity_rating: float = 0.0  # 0 to 1
    time_wasting_probability: float = 0.0  # 0 to 1 (High if consuming excessive social media/entertainment)
    focus_score: float = 0.0  # 0 to 1 (Is browsing highly concentrated or scattered?)

    # --- Academic & Career Alignment ---
    educational_alignment: float = 0.0  # 0 to 1
    academic_relevance_score: float = 0.0  # 0 to 1
    career_development_focus: float = 0.0  # 0 to 1

    # --- Commercial Intent ---
    shopping_intent_score: float = 0.0  # 0 to 1
    likely_in_market_for: List[str] = []

    # --- Extracted Entities ---
    top_entities: List[Entity] = []

    # --- Institutional Risk Assessment ---
    risk_assessment_summary: Optional[str] = None
    nsfw_or_inappropriate_probability: float = 0.0  # 0 to 1

    # --- AI Synthesized Summaries ---
    # Required field (no default): behavioral_summary.
    behavioral_summary: str
    actionable_recommendation: Optional[str] = None  # E.g., "User might need study resources for Python"

    # Unprocessed AI response, kept alongside the parsed fields.
    raw_ai_response: Optional[Dict[str, Any]] = None
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.4
2
+ Name: sitewise-crawler
3
+ Version: 0.1.0
4
+ Summary: A flexible and advanced web crawler for modern SPAs and traditional websites.
5
+ Author-email: TarXemo <info@tarxemo.com>
6
+ Project-URL: Homepage, https://github.com/tarxemo/sitewise-crawler
7
+ Project-URL: Bug Tracker, https://github.com/tarxemo/sitewise-crawler/issues
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
12
+ Requires-Python: >=3.10
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: requests
15
+ Requires-Dist: beautifulsoup4
16
+ Requires-Dist: playwright
17
+ Requires-Dist: trafilatura
18
+ Requires-Dist: lxml
19
+ Requires-Dist: pydantic
20
+ Requires-Dist: aiohttp
21
+ Requires-Dist: tenacity
22
+ Requires-Dist: pypdf
23
+ Requires-Dist: python-docx
24
+ Requires-Dist: groq
25
+
26
+ # Sitewise Crawler 🕷️
27
+
28
+ An advanced, flexible, and production-ready web crawler for modern websites. Automatically detects SPAs (Single Page Applications) and switches between fast `requests` fetching and full JavaScript rendering with `Playwright`.
29
+
30
+ ## ✨ Features
31
+
32
+ - 🚀 **Hybrid Rendering**: Automatically detects React, Vue, Angular, and Next.js to switch rendering engines on the fly.
33
+ - 🧠 **Smart Extraction**: Built-in main content extraction that removes headers, footers, and sidebars.
34
+ - 🔗 **SPA Link Discovery**: Discovers links even in complex client-side routers.
35
+ - 🛠️ **Fully Configurable**: Control depth, concurrency, rate limits, and custom wait selectors.
36
+ - šŸ“ **Pydantic Models**: Type-safe configuration and results.
37
+
38
+ ## 📦 Installation
39
+
40
+ ```bash
41
+ pip install sitewise-crawler
42
+ playwright install chromium
43
+ ```
44
+
45
+ ## 🚀 Quick Start
46
+
47
+ ```python
48
+ import asyncio
49
+ from sitewise_crawler import SPACrawler, CrawlerConfig
50
+
51
+ async def main():
52
+ # 1. Configure the crawler
53
+ config = CrawlerConfig(
54
+ start_url="https://example.com",
55
+ max_depth=2,
56
+ max_pages=10,
57
+ use_playwright=True,
58
+ headless=True
59
+ )
60
+
61
+ # 2. Initialize and run
62
+ crawler = SPACrawler(config)
63
+
64
+ # Optional: Add a callback for each page crawled
65
+ crawler.on_page_crawled = lambda page: print(f"Crawled: {page.url} | Title: {page.title}")
66
+
67
+ result = await crawler.crawl()
68
+
69
+ # 3. Process results
70
+ if result.success:
71
+ print(f"\n✅ Crawl complete! Found {result.total_pages} pages.")
72
+ for page in result.pages_all:
73
+ print(f"- {page.url} ({len(page.content)} chars)")
74
+
75
+ if __name__ == "__main__":
76
+ asyncio.run(main())
77
+ ```
78
+
79
+ ## āš™ļø Configuration Options
80
+
81
+ The `CrawlerConfig` class supports the following parameters:
82
+
83
+ | Parameter | Type | Default | Description |
84
+ |-----------|------|---------|-------------|
85
+ | `start_url` | `str` | *Required* | The entry point for the crawler. |
86
+ | `max_depth` | `int` | `3` | Maximum crawl depth from the start URL. |
87
+ | `max_pages` | `int` | `100` | Stop crawling after this many pages. |
88
+ | `use_playwright` | `bool` | `True` | Enable JavaScript rendering for SPAs. |
89
+ | `headless` | `bool` | `True` | Run browser in headless mode. |
90
+ | `rate_limit_delay` | `float` | `1.0` | Seconds to wait between requests. |
91
+ | `wait_for_selector`| `str` | `None` | CSS selector to wait for before extracting SPA content. |
92
+
93
+ ## šŸ¤ Contributing
94
+
95
+ Contributions are welcome! Please feel free to submit a Pull Request.
96
+
97
+ ## 📄 License
98
+
99
+ This project is licensed under the MIT License - see the LICENSE file for details.
100
+ # sitewise_crawler
@@ -0,0 +1,15 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/sitewise_crawler/__init__.py
4
+ src/sitewise_crawler/analyzer.py
5
+ src/sitewise_crawler/crawler.py
6
+ src/sitewise_crawler/extractors.py
7
+ src/sitewise_crawler/fetchers.py
8
+ src/sitewise_crawler/models.py
9
+ src/sitewise_crawler.egg-info/PKG-INFO
10
+ src/sitewise_crawler.egg-info/SOURCES.txt
11
+ src/sitewise_crawler.egg-info/dependency_links.txt
12
+ src/sitewise_crawler.egg-info/requires.txt
13
+ src/sitewise_crawler.egg-info/top_level.txt
14
+ tests/test_analyzer.py
15
+ tests/test_local.py
@@ -0,0 +1,11 @@
1
+ requests
2
+ beautifulsoup4
3
+ playwright
4
+ trafilatura
5
+ lxml
6
+ pydantic
7
+ aiohttp
8
+ tenacity
9
+ pypdf
10
+ python-docx
11
+ groq
@@ -0,0 +1 @@
1
+ sitewise_crawler
@@ -0,0 +1,48 @@
1
+ import asyncio
2
+ import sys
3
+ import os
4
+ import json
5
+
6
+ # Add src to path for local testing
7
+ sys.path.append(os.path.join(os.getcwd(), 'src'))
8
+
9
+ from sitewise_crawler import InsightEngine, CrawlerConfig
10
+
11
async def test_analyzer():
    """Run a live, manual smoke test of ``InsightEngine`` on two public URLs.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable.
    When the variable is unset, empty, or still the placeholder value, a hint
    is printed and the coroutine returns before any engine is created.
    Otherwise the URLs are analysed for a dummy user id and the resulting
    insight is printed as indented JSON. Exceptions raised while building the
    crawler config or running the analysis are printed rather than propagated.
    """
    placeholder_key = "your-groq-api-key"
    api_key = os.getenv("GROQ_API_KEY", placeholder_key)

    # An empty value (e.g. `export GROQ_API_KEY=`) is as unusable as the
    # placeholder, so treat both as "not configured".
    if not api_key or api_key == placeholder_key:
        print("⚠️ Please set your GROQ_API_KEY environment variable to test the AI analyzer.")
        return

    print("🚀 Starting InsightEngine test...")
    engine = InsightEngine(api_key=api_key)

    # We use some fast, public URLs to simulate a user's browsing history
    urls_to_analyze = [
        "https://en.wikipedia.org/wiki/Machine_learning",
        "https://www.python.org/",
    ]

    try:
        # We can pass a custom CrawlerConfig to speed up the test (e.g., disable playwright)
        config = CrawlerConfig(
            start_url=urls_to_analyze[0],
            max_pages=len(urls_to_analyze),
            use_playwright=False,
        )

        insight = await engine.analyze_user_behavior(
            user_id="test_user_001",
            urls=urls_to_analyze,
            crawler_config=config,
        )

        print("\n✅ Analysis Complete! Here is the data ready for your database:\n")

        # Convert to dictionary and print as formatted JSON
        insight_dict = insight.model_dump(mode='json')
        print(json.dumps(insight_dict, indent=2))

    except Exception as e:
        # Deliberate best-effort boundary for a manual script: report the
        # failure on stdout instead of crashing with a traceback.
        print(f"❌ Error during analysis: {e}")


if __name__ == "__main__":
    asyncio.run(test_analyzer())
@@ -0,0 +1,38 @@
1
+ import asyncio
2
+ import sys
3
+ import os
4
+
5
+ # Add src to path for local testing
6
+ sys.path.append(os.path.join(os.getcwd(), 'src'))
7
+
8
+ from sitewise_crawler import SPACrawler, CrawlerConfig
9
+
10
async def test_crawl():
    """Run a live, manual smoke crawl and print a short summary.

    Builds a ``CrawlerConfig`` for ``https://www.google.com`` with
    ``max_depth=1``, ``max_pages=2`` and Playwright disabled, assigns a
    debug-printing callback to the crawler's ``on_page_crawled`` attribute,
    and awaits ``crawl()``. Afterwards the success flag, page count and
    duration are printed, followed by the URL and title of each page in
    ``result.pages_all`` when the crawl reports success.
    """
    print("🚀 Starting test crawl...")
    config = CrawlerConfig(
        start_url="https://www.google.com",  # Fast, traditional site
        max_depth=1,
        max_pages=2,
        use_playwright=False,  # Keep it fast for testing
    )

    crawler = SPACrawler(config)

    def on_page(page):
        # Progress trace for each page object handed to the callback.
        print(f"DEBUG: Processed {page.url}")

    crawler.on_page_crawled = on_page

    result = await crawler.crawl()

    # Plain string: there are no placeholders, so no f-prefix is needed.
    print("\nSummary:")
    print(f"Success: {result.success}")
    print(f"Pages: {result.total_pages}")
    print(f"Duration: {result.duration_seconds:.2f}s")

    if result.success:
        for p in result.pages_all:
            print(f"- {p.url} ({p.title})")


if __name__ == "__main__":
    asyncio.run(test_crawl())