webweavex 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,84 @@
1
+ # OS
2
+ .DS_Store
3
+ Thumbs.db
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *.pyo
9
+ *.pyd
10
+ *.egg-info/
11
+ .venv/
12
+ .venv-*/
13
+ venv/
14
+ .env
15
+
16
+ # Node
17
+ node_modules/
18
+ .npm/
19
+ .pnpm-store/
20
+ .yarn/
21
+ .yarn-cache/
22
+ .yarnrc.yml
23
+ pnpm-debug.log*
24
+ npm-debug.log*
25
+
26
+ # Java/Kotlin/Gradle
27
+ .gradle/
28
+ build/
29
+ *.class
30
+ *.jar
31
+ *.war
32
+ *.ear
33
+
34
+ # Maven
35
+ target/
36
+
37
+ # Dart/Flutter
38
+ .dart_tool/
39
+ .flutter-plugins
40
+ .flutter-plugins-dependencies
41
+ .packages
42
+ .pub-cache/
43
+ build/
44
+
45
+ # IDE
46
+ .idea/
47
+ .vscode/
48
+ *.iml
49
+
50
+ # Logs
51
+ *.log
52
+
53
+ # Coverage
54
+ coverage/
55
+ htmlcov/
56
+
57
+ # Test artifacts
58
+ .pytest_cache/
59
+
60
+ # Terraform or infra
61
+ *.tfstate
62
+ *.tfstate.backup
63
+
64
+ # Misc
65
+ *.tmp
66
+ *.swp
67
+ .m2/
68
+ .pkg-verify*/
69
+ .tmp_dart_publish_check/
70
+
71
+ # Runtime artifacts
72
+ rag_dataset.jsonl
73
+ knowledge_graph.graphml
74
+ repo_dataset.jsonl
75
+ repo_graph.graphml
76
+ repo_summary.md
77
+ .tmp_site/
78
+ website/.docusaurus/
79
+ website/.npm-cache/
80
+ sdk/node/.npm-cache/
81
+ sdk/node/*.tgz
82
+
83
+ !sdk/dart/bin/
84
+ !sdk/dart/lib/
File without changes
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: webweavex
3
+ Version: 0.1.0
4
+ Summary: AI-native web crawling platform
5
+ Author: Piyush Mishra
6
+ License: Apache-2.0
7
+ Keywords: ai,crawler,data extraction,knowledge graph,rag,web scraping
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Internet :: WWW/HTTP
13
+ Classifier: Topic :: Software Development :: Libraries
14
+ Requires-Python: >=3.8
15
+ Requires-Dist: beautifulsoup4>=4.12.0
16
+ Requires-Dist: certifi>=2024.9.0
17
+ Requires-Dist: fastapi>=0.110.0
18
+ Requires-Dist: httpx>=0.25.0
19
+ Requires-Dist: markdownify>=0.13.1
20
+ Requires-Dist: networkx>=3.0
21
+ Requires-Dist: playwright>=1.42.0
22
+ Requires-Dist: pydantic>=2.6.0
23
+ Requires-Dist: redis>=5.0
24
+ Requires-Dist: spacy>=3.7.0
25
+ Requires-Dist: uvicorn>=0.29.0
26
+ Description-Content-Type: text/markdown
27
+
28
+ # WebWeaveX 🚀
29
+
30
+ > The Next-Gen AI-Powered Web Crawling Engine
31
+ > Multi-language SDKs • Structured Data • Knowledge Graphs • RAG-ready
32
+
33
+ ---
34
+
35
+ ## 🌍 Why WebWeaveX?
36
+
37
+ WebWeaveX is not just a crawler.
38
+
39
+ It is a **data intelligence engine** designed for:
40
+
41
+ * 🤖 AI / LLM pipelines (RAG datasets)
42
+ * 📊 Structured web extraction
43
+ * 🧠 Knowledge graph generation
44
+ * ⚡ High-performance crawling
45
+ * 🌐 Multi-language developer ecosystem
46
+
47
+ ---
48
+
49
+ ## ⚡ Features
50
+
51
+ * 🔍 Smart crawling (HTML, metadata, links)
52
+ * 🧠 AI-ready outputs (JSON, text, markdown)
53
+ * 🌐 Multi-language SDKs:
54
+
55
+ * Python (pip)
56
+ * Node.js (npm)
57
+ * Dart (pub.dev)
58
+ * Java (Maven)
59
+ * Kotlin
60
+ * ⚡ Async + high-performance engine
61
+ * 🔐 SSL + secure crawling
62
+ * 📦 CLI + API server
63
+
64
+ ---
65
+
66
+ ## 📦 Installation
67
+
68
+ ```bash
69
+ pip install webweavex
70
+ ```
71
+
72
+ ---
73
+
74
+ ## 🚀 Quick Start
75
+
76
+ ```python
77
+ import asyncio
78
+
79
+ from webweavex import AsyncWebWeaveX
80
+
81
+ async def main() -> None:
82
+     crawler = AsyncWebWeaveX()
83
+     result = await crawler.crawl("https://example.com")
84
+     print(result.metadata.title)
85
+
86
+ asyncio.run(main())
87
+ ```
84
+
85
+ ---
86
+
87
+ ## 🧪 CLI Usage
88
+
89
+ ```bash
90
+ webweavex crawl https://example.com
91
+ ```
92
+
93
+ ---
94
+
95
+ ## 🧠 Output Example
96
+
97
+ ```json
98
+ {
99
+ "url": "https://example.com",
100
+ "status": 200,
101
+ "metadata": {
102
+ "title": "Example Domain"
103
+ },
104
+ "links": [
105
+ {
106
+ "url": "https://iana.org/domains/example",
107
+ "text": "Learn more"
108
+ }
109
+ ]
110
+ }
111
+ ```
112
+
113
+ ---
114
+
115
+ ## 🌐 Multi-Language SDKs
116
+
117
+ WebWeaveX is built for **global developer adoption**:
118
+
119
+ | Language | Package |
120
+ | -------- | --------------------- |
121
+ | Python | pip install webweavex |
122
+ | Node | npm install webweavex |
123
+ | Dart | pub.dev |
124
+ | Java | Maven |
125
+ | Kotlin | Gradle |
126
+
127
+ ---
128
+
129
+ ## ⚡ Benchmarks
130
+
131
+ * ⚡ Fast async crawling
132
+ * 📉 Low memory usage
133
+ * 🔁 Concurrent processing
134
+ * 🔍 Optimized parsing
135
+
136
+ ---
137
+
138
+ ## 🔐 Security
139
+
140
+ * SSL verification enabled by default
141
+ * Safe crawling practices
142
+ * Configurable policies
143
+
144
+ ---
145
+
146
+ ## 🤝 Contributing
147
+
148
+ We welcome contributions 🚀
149
+
150
+ ```bash
151
+ git clone https://github.com/PIYUSH-MISHRA-00/WebWeaveX.git
152
+ ```
153
+
154
+ ---
155
+
156
+ ## 📜 License
157
+
158
+ Apache License 2.0
159
+
160
+ ---
161
+
162
+ ## ⭐ Support
163
+
164
+ If you like this project:
165
+
166
+ 👉 Star the repo
167
+ 👉 Share with developers
168
+ 👉 Use in production
169
+
170
+ ---
171
+
172
+ ## 🚀 Vision
173
+
174
+ WebWeaveX is built for the future of:
175
+
176
+ * AI Agents
177
+ * Autonomous systems
178
+ * Data intelligence platforms
179
+
180
+ ---
@@ -0,0 +1,153 @@
1
+ # WebWeaveX 🚀
2
+
3
+ > The Next-Gen AI-Powered Web Crawling Engine
4
+ > Multi-language SDKs • Structured Data • Knowledge Graphs • RAG-ready
5
+
6
+ ---
7
+
8
+ ## 🌍 Why WebWeaveX?
9
+
10
+ WebWeaveX is not just a crawler.
11
+
12
+ It is a **data intelligence engine** designed for:
13
+
14
+ * 🤖 AI / LLM pipelines (RAG datasets)
15
+ * 📊 Structured web extraction
16
+ * 🧠 Knowledge graph generation
17
+ * ⚡ High-performance crawling
18
+ * 🌐 Multi-language developer ecosystem
19
+
20
+ ---
21
+
22
+ ## ⚡ Features
23
+
24
+ * 🔍 Smart crawling (HTML, metadata, links)
25
+ * 🧠 AI-ready outputs (JSON, text, markdown)
26
+ * 🌐 Multi-language SDKs:
27
+
28
+ * Python (pip)
29
+ * Node.js (npm)
30
+ * Dart (pub.dev)
31
+ * Java (Maven)
32
+ * Kotlin
33
+ * ⚡ Async + high-performance engine
34
+ * 🔐 SSL + secure crawling
35
+ * 📦 CLI + API server
36
+
37
+ ---
38
+
39
+ ## 📦 Installation
40
+
41
+ ```bash
42
+ pip install webweavex
43
+ ```
44
+
45
+ ---
46
+
47
+ ## 🚀 Quick Start
48
+
49
+ ```python
50
+ from webweavex import AsyncWebWeaveX
51
+
52
+ crawler = AsyncWebWeaveX()
53
+ result = crawler.crawl("https://example.com")
54
+
55
+ print(result["metadata"]["title"])
56
+ ```
57
+
58
+ ---
59
+
60
+ ## 🧪 CLI Usage
61
+
62
+ ```bash
63
+ webweavex crawl https://example.com
64
+ ```
65
+
66
+ ---
67
+
68
+ ## 🧠 Output Example
69
+
70
+ ```json
71
+ {
72
+ "url": "https://example.com",
73
+ "status": 200,
74
+ "metadata": {
75
+ "title": "Example Domain"
76
+ },
77
+ "links": [
78
+ {
79
+ "url": "https://iana.org/domains/example",
80
+ "text": "Learn more"
81
+ }
82
+ ]
83
+ }
84
+ ```
85
+
86
+ ---
87
+
88
+ ## 🌐 Multi-Language SDKs
89
+
90
+ WebWeaveX is built for **global developer adoption**:
91
+
92
+ | Language | Package |
93
+ | -------- | --------------------- |
94
+ | Python | pip install webweavex |
95
+ | Node | npm install webweavex |
96
+ | Dart | pub.dev |
97
+ | Java | Maven |
98
+ | Kotlin | Gradle |
99
+
100
+ ---
101
+
102
+ ## ⚡ Benchmarks
103
+
104
+ * ⚡ Fast async crawling
105
+ * 📉 Low memory usage
106
+ * 🔁 Concurrent processing
107
+ * 🔍 Optimized parsing
108
+
109
+ ---
110
+
111
+ ## 🔐 Security
112
+
113
+ * SSL verification enabled by default
114
+ * Safe crawling practices
115
+ * Configurable policies
116
+
117
+ ---
118
+
119
+ ## 🤝 Contributing
120
+
121
+ We welcome contributions 🚀
122
+
123
+ ```bash
124
+ git clone https://github.com/PIYUSH-MISHRA-00/WebWeaveX.git
125
+ ```
126
+
127
+ ---
128
+
129
+ ## 📜 License
130
+
131
+ Apache License 2.0
132
+
133
+ ---
134
+
135
+ ## ⭐ Support
136
+
137
+ If you like this project:
138
+
139
+ 👉 Star the repo
140
+ 👉 Share with developers
141
+ 👉 Use in production
142
+
143
+ ---
144
+
145
+ ## 🚀 Vision
146
+
147
+ WebWeaveX is built for the future of:
148
+
149
+ * AI Agents
150
+ * Autonomous systems
151
+ * Data intelligence platforms
152
+
153
+ ---
@@ -0,0 +1,53 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.21.0"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "webweavex"
7
+ version = "0.1.0"
8
+ description = "AI-native web crawling platform"
9
+ readme = { file = "README.md", content-type = "text/markdown" }
10
+ requires-python = ">=3.8"
11
+ license = { text = "Apache-2.0" }
12
+
13
+ authors = [
14
+ { name = "Piyush Mishra" }
15
+ ]
16
+
17
+ keywords = [
18
+ "crawler",
19
+ "web scraping",
20
+ "rag",
21
+ "knowledge graph",
22
+ "ai",
23
+ "data extraction"
24
+ ]
25
+
26
+ classifiers = [
27
+ "Programming Language :: Python :: 3",
28
+ "License :: OSI Approved :: Apache Software License",
29
+ "Operating System :: OS Independent",
30
+ "Intended Audience :: Developers",
31
+ "Topic :: Internet :: WWW/HTTP",
32
+ "Topic :: Software Development :: Libraries"
33
+ ]
34
+
35
+ dependencies = [
36
+ "httpx>=0.25.0",
37
+ "beautifulsoup4>=4.12.0",
38
+ "markdownify>=0.13.1",
39
+ "playwright>=1.42.0",
40
+ "pydantic>=2.6.0",
41
+ "fastapi>=0.110.0",
42
+ "uvicorn>=0.29.0",
43
+ "spacy>=3.7.0",
44
+ "redis>=5.0",
45
+ "networkx>=3.0",
46
+ "certifi>=2024.9.0"
47
+ ]
48
+
49
+ [project.scripts]
50
+ webweavex = "webweavex.cli:main"
51
+
52
+ [tool.hatch.build.targets.wheel]
53
+ packages = ["webweavex"]
@@ -0,0 +1,33 @@
1
"""WebWeaveX core package.

Re-exports the public crawling API and performs two import-time setup steps:

* Points ``SSL_CERT_FILE`` at certifi's CA bundle when the interpreter has
  no usable default bundle (some minimal CI containers ship without one).
* Prepends the repository root to ``sys.path`` so sibling top-level
  packages (e.g. ``crawler_engine``) resolve.
"""

import os
import ssl
import sys
from pathlib import Path

# Fall back to certifi's CA bundle only when the platform default is missing.
# setdefault() ensures a user-provided SSL_CERT_FILE always wins.
try:
    import certifi
except ImportError:  # pragma: no cover
    pass
else:
    _default_cafile = ssl.get_default_verify_paths().cafile
    if not _default_cafile or not os.path.exists(_default_cafile):
        os.environ.setdefault("SSL_CERT_FILE", certifi.where())

# NOTE(review): mutating sys.path at import time is a workaround for the
# repo's flat layout; sibling packages live next to this package's parent.
ROOT = Path(__file__).resolve().parents[1]  # core
sys.path.insert(0, str(ROOT.parent))  # root

from .async_engine import AsyncWebWeaveX
from .config import CrawlConfig
from .engine import WebWeaveX
from .models import Link, Metadata, PageResult

__all__ = [
    "AsyncWebWeaveX",
    "CrawlConfig",
    "WebWeaveX",
    "Link",
    "Metadata",
    "PageResult",
]
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ from fastapi import FastAPI
4
+ from pydantic import BaseModel
5
+
6
+ from .async_engine import AsyncWebWeaveX
7
+ from .models import PageResult
8
+
9
+
10
+ class UrlRequest(BaseModel):
11
+ url: str
12
+
13
+
14
+ app = FastAPI(title="WebWeaveX API", version="0.1.0")
15
+ _engine = AsyncWebWeaveX()
16
+
17
+
18
+ @app.on_event("shutdown")
19
+ async def _shutdown_engine() -> None:
20
+ await _engine.aclose()
21
+
22
+
23
+ @app.post("/crawl", response_model=PageResult)
24
+ async def crawl(request: UrlRequest) -> PageResult:
25
+ return await _engine.crawl(request.url)
26
+
27
+
28
+ @app.post("/crawl_site", response_model=list[PageResult])
29
+ async def crawl_site(request: UrlRequest) -> list[PageResult]:
30
+ return await _engine.crawl_site(request.url)
31
+
32
+
33
+ @app.post("/rag_dataset")
34
+ async def rag_dataset(request: UrlRequest) -> list[dict[str, object]]:
35
+ return await _engine.build_rag_dataset(request.url)
36
+
37
+
38
+ @app.post("/knowledge_graph")
39
+ async def knowledge_graph(request: UrlRequest) -> dict[str, list[dict[str, str]]]:
40
+ return await _engine.build_knowledge_graph(request.url)
@@ -0,0 +1,56 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from .async_fetcher import AsyncFetcher
6
+ from .config import CrawlConfig
7
+ from .crawler import parse_html
8
+ from .js_renderer import JSRenderer
9
+ from .logging import get_logger
10
+ from .models import Metadata, PageResult
11
+ from .robots import RobotsHandler
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
+ class AsyncCrawler:
17
+ """Concurrent async crawler for multiple URLs."""
18
+
19
+ def __init__(
20
+ self,
21
+ fetcher: AsyncFetcher,
22
+ config: CrawlConfig,
23
+ robots: RobotsHandler | None = None,
24
+ renderer: JSRenderer | None = None,
25
+ ) -> None:
26
+ self._fetcher = fetcher
27
+ self._config = config
28
+ self._robots = robots
29
+ self._renderer = renderer
30
+ self._semaphore = asyncio.Semaphore(max(1, config.max_concurrency))
31
+
32
+ async def crawl(self, url: str) -> PageResult:
33
+ logger.info("Async crawling %s", url)
34
+ if self._robots:
35
+ allowed = await self._robots.allowed(url)
36
+ if not allowed:
37
+ logger.info("Robots blocked %s", url)
38
+ return PageResult(url=url, status=403, html=None, links=[], metadata=Metadata())
39
+
40
+ async with self._semaphore:
41
+ if self._config.enable_js:
42
+ if self._renderer is None:
43
+ logger.warning("JS rendering enabled but no renderer configured; falling back to fetcher")
44
+ status, html = await self._fetcher.fetch(url)
45
+ else:
46
+ logger.info("JS rendering used for %s", url)
47
+ html = await self._renderer.render(url)
48
+ status = 200
49
+ else:
50
+ status, html = await self._fetcher.fetch(url)
51
+ links, metadata = parse_html(html)
52
+ return PageResult(url=url, status=status, html=html, links=links, metadata=metadata)
53
+
54
+ async def crawl_many(self, urls: list[str]) -> list[PageResult]:
55
+ tasks = [asyncio.create_task(self.crawl(url)) for url in urls]
56
+ return await asyncio.gather(*tasks)
@@ -0,0 +1,86 @@
1
from __future__ import annotations

from .async_crawler import AsyncCrawler
from .async_fetcher import AsyncFetcher
from .config import CrawlConfig
from .js_renderer import JSRenderer
from .logging import get_logger
from .models import PageResult
from .robots import RobotsHandler

logger = get_logger(__name__)


class AsyncWebWeaveX:
    """Async entry point for concurrent crawling.

    Wires a fetcher, optional JS renderer and robots handler into an
    :class:`AsyncCrawler`. Collaborators created by the engine itself are
    released by :meth:`aclose`; injected ones remain the caller's
    responsibility.
    """

    def __init__(
        self,
        config: CrawlConfig | None = None,
        fetcher: AsyncFetcher | None = None,
        renderer: JSRenderer | None = None,
        robots: RobotsHandler | None = None,
    ) -> None:
        self.config = config or CrawlConfig()
        # Record ownership so aclose() only tears down what we created here.
        self._owns_fetcher = fetcher is None
        self._owns_robots = robots is None
        self.fetcher = fetcher or AsyncFetcher(self.config)
        self.robots = robots or RobotsHandler(self.config)
        if renderer:
            self.renderer = renderer
        elif self.config.enable_js:
            self.renderer = JSRenderer(self.config)
        else:
            self.renderer = None
        self._crawler = AsyncCrawler(
            self.fetcher,
            self.config,
            robots=self.robots,
            renderer=self.renderer,
        )
        logger.debug("AsyncWebWeaveX engine initialized")

    async def crawl(self, url: str) -> PageResult:
        """Crawl a single URL and return its parsed page result."""
        logger.info("Async engine crawl requested for %s", url)
        return await self._crawler.crawl(url)

    async def crawl_many(self, urls: list[str]) -> list[PageResult]:
        """Crawl several URLs concurrently; results follow input order."""
        logger.info("Async engine crawl_many requested for %s urls", len(urls))
        return await self._crawler.crawl_many(urls)

    async def crawl_site(self, url: str) -> list[PageResult]:
        """Crawl a whole site starting at *url* via a throwaway SiteCrawler."""
        logger.info("Async engine site crawl requested for %s", url)
        # Imported lazily so the sibling package is only required when used.
        from crawler_engine.site_crawler import SiteCrawler

        site_crawler = SiteCrawler(
            self.config,
            fetcher=self.fetcher,
            robots=self.robots,
            renderer=self.renderer,
        )
        try:
            return await site_crawler.crawl_site(url)
        finally:
            await site_crawler.aclose()

    async def build_rag_dataset(self, url: str) -> list[dict[str, object]]:
        """Crawl a site and convert its pages into a RAG dataset."""
        logger.info("Dataset generation started for %s", url)
        from rag.rag_pipeline import build_dataset

        dataset = build_dataset(await self.crawl_site(url))
        logger.info("Dataset size %s", len(dataset))
        return dataset

    async def build_knowledge_graph(self, url: str) -> dict[str, list[dict[str, str]]]:
        """Crawl a site and derive a node/edge knowledge graph from it."""
        logger.info("Knowledge graph generation started for %s", url)
        from knowledge_graph.graph_pipeline import build_graph

        graph = build_graph(await self.crawl_site(url))
        logger.info(
            "Knowledge graph nodes %s edges %s",
            len(graph.get("nodes", [])),
            len(graph.get("edges", [])),
        )
        return graph

    async def aclose(self) -> None:
        """Close resources this engine created (fetcher and robots handler).

        NOTE(review): an engine-created renderer is never closed here —
        confirm whether JSRenderer holds resources needing explicit shutdown.
        """
        if self._owns_fetcher:
            await self.fetcher.close()
        if self._owns_robots:
            await self.robots.close()

    async def __aenter__(self) -> "AsyncWebWeaveX":
        return self

    async def __aexit__(self, exc_type, exc, tb) -> None:
        await self.aclose()