jwebs 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. jwebs-1.0.0/NOTICE +2 -0
  2. jwebs-1.0.0/PKG-INFO +210 -0
  3. jwebs-1.0.0/README.md +177 -0
  4. jwebs-1.0.0/pyproject.toml +32 -0
  5. jwebs-1.0.0/setup.cfg +4 -0
  6. jwebs-1.0.0/src/jwebs/__init__.py +23 -0
  7. jwebs-1.0.0/src/jwebs/ai.py +328 -0
  8. jwebs-1.0.0/src/jwebs/async_.py +108 -0
  9. jwebs-1.0.0/src/jwebs/captcha.py +99 -0
  10. jwebs-1.0.0/src/jwebs/check.py +397 -0
  11. jwebs-1.0.0/src/jwebs/core/__init__.py +13 -0
  12. jwebs-1.0.0/src/jwebs/core/cache.py +167 -0
  13. jwebs-1.0.0/src/jwebs/core/constants.py +41 -0
  14. jwebs-1.0.0/src/jwebs/core/datatypes.py +248 -0
  15. jwebs-1.0.0/src/jwebs/core/deps.py +50 -0
  16. jwebs-1.0.0/src/jwebs/core/exceptions.py +26 -0
  17. jwebs-1.0.0/src/jwebs/core/http.py +1403 -0
  18. jwebs-1.0.0/src/jwebs/core/http2.py +688 -0
  19. jwebs-1.0.0/src/jwebs/core/logging.py +115 -0
  20. jwebs-1.0.0/src/jwebs/core/ratelimit.py +62 -0
  21. jwebs-1.0.0/src/jwebs/core/robots.py +137 -0
  22. jwebs-1.0.0/src/jwebs/core/session.py +83 -0
  23. jwebs-1.0.0/src/jwebs/core/utils.py +49 -0
  24. jwebs-1.0.0/src/jwebs/crawl.py +233 -0
  25. jwebs-1.0.0/src/jwebs/diff.py +50 -0
  26. jwebs-1.0.0/src/jwebs/extract.py +244 -0
  27. jwebs-1.0.0/src/jwebs/generate.py +37 -0
  28. jwebs-1.0.0/src/jwebs/jwebs.py +1116 -0
  29. jwebs-1.0.0/src/jwebs/monitor.py +94 -0
  30. jwebs-1.0.0/src/jwebs/proxy.py +46 -0
  31. jwebs-1.0.0/src/jwebs/smart.py +128 -0
  32. jwebs-1.0.0/src/jwebs.egg-info/PKG-INFO +210 -0
  33. jwebs-1.0.0/src/jwebs.egg-info/SOURCES.txt +39 -0
  34. jwebs-1.0.0/src/jwebs.egg-info/dependency_links.txt +1 -0
  35. jwebs-1.0.0/src/jwebs.egg-info/requires.txt +27 -0
  36. jwebs-1.0.0/src/jwebs.egg-info/top_level.txt +1 -0
  37. jwebs-1.0.0/tests/test_async_requests.py +16 -0
  38. jwebs-1.0.0/tests/test_basic_extract.py +19 -0
  39. jwebs-1.0.0/tests/test_crawler.py +11 -0
  40. jwebs-1.0.0/tests/test_performance_ping.py +15 -0
  41. jwebs-1.0.0/tests/test_security_seo_audio.py +19 -0
jwebs-1.0.0/NOTICE ADDED
@@ -0,0 +1,2 @@
1
+ jwebs
2
+ Copyright 2026 J Code (Mohammadjavad Maleki Kaveh)
jwebs-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,210 @@
1
+ Metadata-Version: 2.4
2
+ Name: jwebs
3
+ Version: 1.0.0
4
+ Summary: A powerful and advanced web scraping and automation library for Python
5
+ Author: J Code
6
+ Project-URL: Homepage, https://github.com/JCode-JCode/jwebs
7
+ Project-URL: Repository, https://github.com/JCode-JCode/jwebs
8
+ Requires-Python: >=3.8
9
+ Description-Content-Type: text/markdown
10
+ License-File: NOTICE
11
+ Requires-Dist: urllib3>=1.26
12
+ Requires-Dist: beautifulsoup4>=4.12
13
+ Requires-Dist: lxml>=4.9
14
+ Provides-Extra: sentiment
15
+ Requires-Dist: vaderSentiment; extra == "sentiment"
16
+ Provides-Extra: translation
17
+ Requires-Dist: deep-translator; extra == "translation"
18
+ Provides-Extra: brotli
19
+ Requires-Dist: brotli>=1.0.0; extra == "brotli"
20
+ Provides-Extra: distributed
21
+ Requires-Dist: redis>=4.5.0; extra == "distributed"
22
+ Provides-Extra: http2
23
+ Requires-Dist: httpx[http2]>=0.27.0; extra == "http2"
24
+ Provides-Extra: all
25
+ Requires-Dist: vaderSentiment; extra == "all"
26
+ Requires-Dist: deep-translator; extra == "all"
27
+ Requires-Dist: chardet; extra == "all"
28
+ Requires-Dist: charset_normalizer; extra == "all"
29
+ Requires-Dist: brotli>=1.0.0; extra == "all"
30
+ Requires-Dist: redis>=4.5.0; extra == "all"
31
+ Requires-Dist: httpx[http2]>=0.27.0; extra == "all"
32
+ Dynamic: license-file
33
+
34
+ # jwebs
35
+
36
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/downloads/)
37
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
38
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
39
+ [![PyPI version](https://badge.fury.io/py/jwebs.svg)](https://badge.fury.io/py/jwebs)
40
+
41
+ <br>
42
+
43
+ <img src="docs/images/jwebs-logo.png" alt="jwebs logo">
44
+
45
+ <br>
46
+
47
+ **jwebs** is a complete, high‑performance library for web scraping, crawling automation, and content analysis. It supports both HTTP/1.1 and HTTP/2 (user selectable) and includes built‑in caching, rate limiting, robots.txt handling, dynamic proxy rotation, distributed crawling (via Redis), data extraction, content differencing, uptime monitoring, Sitemap/RSS generation, and optional AI‑powered extraction.
48
+
49
+ ---
50
+
51
+ ## Quick Start – Simple GET Request
52
+
53
+ ```python
54
+ from jwebs import JWebs
55
+
56
+ j = JWebs()
57
+ resp = j.GET("https://example.com")
58
+ print(f"Status: {resp.status}")
59
+ print(f"Content length: {len(resp.text)}")
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Main Capabilities
65
+
66
+ **· HTTP** – HTTP/1.1 and HTTP/2 (user selectable), Keep‑Alive, automatic redirects, batch concurrent requests.
67
+
68
+ **· Request Management** – Two‑layer cache (memory + SQLite), rate limiting (Token Bucket), robots.txt respect, session management.
69
+
70
+ **· Security & Flexibility** – User‑Agent rotation, dynamic proxy rotation, client certificates (mTLS), SSL and security headers checking.
71
+
72
+ **· Crawling & Automation** – Simple crawler and distributed crawler (Redis) that can run across multiple machines.
73
+
74
+ **· Data Extraction** – Extract text, links, emails, phone numbers, prices, JSON‑LD, meta tags, images, social media links.
75
+ **· Content Analysis** – Sentiment analysis, automatic translation, content differencing (diff).
76
+
77
+ **· Monitoring** – Uptime monitoring, performance testing (TTFB, page size), SEO and security audits.
78
+
79
+ **· Utilities** – Sitemap.xml generator, RSS feed generator, GraphQL client, async client.
80
+
81
+ **· AI** (optional) – Intelligent data extraction via natural language instructions (DeepSeek/OpenAI) and text summarization.
82
+
83
+ ---
84
+
85
+ ## Installation
86
+
87
+ ```bash
88
+ # Basic installation (core dependencies only)
89
+ pip install jwebs
90
+
91
+ # With HTTP/2 support
92
+ pip install jwebs[http2]
93
+
94
+ # With distributed crawler (Redis)
95
+ pip install jwebs[distributed]
96
+
97
+ # All optional features
98
+ pip install jwebs[all]
99
+ ```
100
+ ## Debug
101
+
102
+ The distributed crawler (`pip install jwebs[distributed]`) needs a running Redis server. If you don't have Redis, install it using your package manager:
103
+
104
+ · Ubuntu/Debian: sudo apt install redis
105
+ · Termux (Android): pkg install redis
106
+ · macOS: brew install redis
107
+
108
+ Or download from redis.io
109
+
110
+ ---
111
+
112
+ ## More Examples
113
+
114
+ ## HTTP/2 and Caching
115
+
116
+ ```python
117
+ from jwebs import JWebs
118
+
119
+ j = JWebs(http_version='2', use_cache=True)
120
+ title = j.GET_TITLE("https://http2.golang.org/")
121
+ print(f"Title: {title}")
122
+ ```
123
+
124
+ ## Extracting Emails and Links
125
+
126
+ ```python
127
+ from jwebs import JWebs
128
+
129
+ j = JWebs()
130
+ emails = j.EXTRACT_EMAILS("https://example.com")
131
+ links = j.GET_LINKS("https://example.com", internal=True)
132
+ print(f"Emails: {emails}\nInternal Links: {len(links)}")
133
+ ```
134
+
135
+ ## Distributed Crawling with Redis
136
+
137
+ ```python
138
+ from jwebs import JWebs
139
+
140
+ j = JWebs()
141
+ crawler = j.create_distributed_crawler(redis_url="redis://localhost:6379/0")
142
+ crawler.add_seed("https://example.com", depth=0)
143
+ crawler.crawl_worker(max_pages=10, max_depth=2, strict_page_limit=True)
144
+
145
+ results = crawler.get_all_results()
146
+ for url, info in results.items():
147
+     print(f"{url} → {info.get('title', 'no title')}")
148
+ ```
149
+
150
+ ## Security Audit
151
+
152
+ ```python
153
+ from jwebs import JWebs
154
+
155
+ j = JWebs()
156
+ report = j.SECURITY_AUDIT("https://example.com")
157
+ print(f"SSL valid: {report.ssl_valid}")
158
+ print(f"Security grade: {report.grade}")
159
+ ```
160
+
161
+ ## Content Differencing
162
+
163
+ ```python
164
+ from jwebs import JWebs
165
+
166
+ j = JWebs()
167
+ snap1 = j.TAKE_SNAPSHOT("version1", "Hello world")
168
+ snap2 = j.TAKE_SNAPSHOT("version2", "Hello jwebs")
169
+ diff = j.COMPARE_SNAPSHOTS(snap1, snap2)
170
+ print(f"Similarity: {j.SIMILARITY('Hello world', 'Hello jwebs')}")
171
+ ```
172
+
173
+ ## Uptime Monitor
174
+
175
+ ```python
176
+ from jwebs import JWebs
177
+ import time
178
+
179
+ j = JWebs()
180
+ j.MONITOR_URL("https://example.com", expected_status=200)
181
+ j.START_MONITORING()
182
+ time.sleep(5)
183
+ j.STOP_MONITORING()
184
+ ```
185
+
186
+ ---
187
+
188
+ ## Issues and Contributions
189
+
190
+ You can report bugs via GitHub Issues or submit fixes via pull requests.
191
+
192
+ ---
193
+
194
+ ## Links
195
+
196
+ **· GitHub repository:**
197
+ https://github.com/JCode-JCode/jwebs
198
+ **· PyPI page:**
199
+ https://pypi.org/project/jwebs/
200
+
201
+ ---
202
+
203
+ ## License
204
+
205
+ This project is licensed under the Apache License 2.0 – see the LICENSE file for details.
206
+
207
+ ---
208
+
209
+ Designed and built with love by **J Code**
210
+
jwebs-1.0.0/README.md ADDED
@@ -0,0 +1,177 @@
1
+ # jwebs
2
+
3
+ [![Python Version](https://img.shields.io/badge/python-3.8%2B-blue)](https://www.python.org/downloads/)
4
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
5
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
6
+ [![PyPI version](https://badge.fury.io/py/jwebs.svg)](https://badge.fury.io/py/jwebs)
7
+
8
+ <br>
9
+
10
+ <img src="docs/images/jwebs-logo.png" alt="jwebs logo">
11
+
12
+ <br>
13
+
14
+ **jwebs** is a complete, high‑performance library for web scraping, crawling automation, and content analysis. It supports both HTTP/1.1 and HTTP/2 (user selectable) and includes built‑in caching, rate limiting, robots.txt handling, dynamic proxy rotation, distributed crawling (via Redis), data extraction, content differencing, uptime monitoring, Sitemap/RSS generation, and optional AI‑powered extraction.
15
+
16
+ ---
17
+
18
+ ## Quick Start – Simple GET Request
19
+
20
+ ```python
21
+ from jwebs import JWebs
22
+
23
+ j = JWebs()
24
+ resp = j.GET("https://example.com")
25
+ print(f"Status: {resp.status}")
26
+ print(f"Content length: {len(resp.text)}")
27
+ ```
28
+
29
+ ---
30
+
31
+ ## Main Capabilities
32
+
33
+ **· HTTP** – HTTP/1.1 and HTTP/2 (user selectable), Keep‑Alive, automatic redirects, batch concurrent requests.
34
+
35
+ **· Request Management** – Two‑layer cache (memory + SQLite), rate limiting (Token Bucket), robots.txt respect, session management.
36
+
37
+ **· Security & Flexibility** – User‑Agent rotation, dynamic proxy rotation, client certificates (mTLS), SSL and security headers checking.
38
+
39
+ **· Crawling & Automation** – Simple crawler and distributed crawler (Redis) that can run across multiple machines.
40
+
41
+ **· Data Extraction** – Extract text, links, emails, phone numbers, prices, JSON‑LD, meta tags, images, social media links.
42
+ **· Content Analysis** – Sentiment analysis, automatic translation, content differencing (diff).
43
+
44
+ **· Monitoring** – Uptime monitoring, performance testing (TTFB, page size), SEO and security audits.
45
+
46
+ **· Utilities** – Sitemap.xml generator, RSS feed generator, GraphQL client, async client.
47
+
48
+ **· AI** (optional) – Intelligent data extraction via natural language instructions (DeepSeek/OpenAI) and text summarization.
49
+
50
+ ---
51
+
52
+ ## Installation
53
+
54
+ ```bash
55
+ # Basic installation (core dependencies only)
56
+ pip install jwebs
57
+
58
+ # With HTTP/2 support
59
+ pip install jwebs[http2]
60
+
61
+ # With distributed crawler (Redis)
62
+ pip install jwebs[distributed]
63
+
64
+ # All optional features
65
+ pip install jwebs[all]
66
+ ```
67
+ ## Debug
68
+
69
+ The distributed crawler (`pip install jwebs[distributed]`) needs a running Redis server. If you don't have Redis, install it using your package manager:
70
+
71
+ · Ubuntu/Debian: sudo apt install redis
72
+ · Termux (Android): pkg install redis
73
+ · macOS: brew install redis
74
+
75
+ Or download from redis.io
76
+
77
+ ---
78
+
79
+ ## More Examples
80
+
81
+ ## HTTP/2 and Caching
82
+
83
+ ```python
84
+ from jwebs import JWebs
85
+
86
+ j = JWebs(http_version='2', use_cache=True)
87
+ title = j.GET_TITLE("https://http2.golang.org/")
88
+ print(f"Title: {title}")
89
+ ```
90
+
91
+ ## Extracting Emails and Links
92
+
93
+ ```python
94
+ from jwebs import JWebs
95
+
96
+ j = JWebs()
97
+ emails = j.EXTRACT_EMAILS("https://example.com")
98
+ links = j.GET_LINKS("https://example.com", internal=True)
99
+ print(f"Emails: {emails}\nInternal Links: {len(links)}")
100
+ ```
101
+
102
+ ## Distributed Crawling with Redis
103
+
104
+ ```python
105
+ from jwebs import JWebs
106
+
107
+ j = JWebs()
108
+ crawler = j.create_distributed_crawler(redis_url="redis://localhost:6379/0")
109
+ crawler.add_seed("https://example.com", depth=0)
110
+ crawler.crawl_worker(max_pages=10, max_depth=2, strict_page_limit=True)
111
+
112
+ results = crawler.get_all_results()
113
+ for url, info in results.items():
114
+     print(f"{url} → {info.get('title', 'no title')}")
115
+ ```
116
+
117
+ ## Security Audit
118
+
119
+ ```python
120
+ from jwebs import JWebs
121
+
122
+ j = JWebs()
123
+ report = j.SECURITY_AUDIT("https://example.com")
124
+ print(f"SSL valid: {report.ssl_valid}")
125
+ print(f"Security grade: {report.grade}")
126
+ ```
127
+
128
+ ## Content Differencing
129
+
130
+ ```python
131
+ from jwebs import JWebs
132
+
133
+ j = JWebs()
134
+ snap1 = j.TAKE_SNAPSHOT("version1", "Hello world")
135
+ snap2 = j.TAKE_SNAPSHOT("version2", "Hello jwebs")
136
+ diff = j.COMPARE_SNAPSHOTS(snap1, snap2)
137
+ print(f"Similarity: {j.SIMILARITY('Hello world', 'Hello jwebs')}")
138
+ ```
139
+
140
+ ## Uptime Monitor
141
+
142
+ ```python
143
+ from jwebs import JWebs
144
+ import time
145
+
146
+ j = JWebs()
147
+ j.MONITOR_URL("https://example.com", expected_status=200)
148
+ j.START_MONITORING()
149
+ time.sleep(5)
150
+ j.STOP_MONITORING()
151
+ ```
152
+
153
+ ---
154
+
155
+ ## Issues and Contributions
156
+
157
+ You can report bugs via GitHub Issues or submit fixes via pull requests.
158
+
159
+ ---
160
+
161
+ ## Links
162
+
163
+ **· GitHub repository:**
164
+ https://github.com/JCode-JCode/jwebs
165
+ **· PyPI page:**
166
+ https://pypi.org/project/jwebs/
167
+
168
+ ---
169
+
170
+ ## License
171
+
172
+ This project is licensed under the Apache License 2.0 – see the LICENSE file for details.
173
+
174
+ ---
175
+
176
+ Designed and built with love by **J Code**
177
+
jwebs-1.0.0/pyproject.toml ADDED
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "jwebs"
7
+ version = "1.0.0"
8
+ description = "A powerful and advanced web scraping and automation library for Python"
9
+ authors = [{name = "J Code"}]
10
+ license = {file = "LICENSE"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.8"
13
+ dependencies = [
14
+ "urllib3>=1.26",
15
+ "beautifulsoup4>=4.12",
16
+ "lxml>=4.9"
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/JCode-JCode/jwebs"
21
+ Repository = "https://github.com/JCode-JCode/jwebs"
22
+
23
+ [project.optional-dependencies]
24
+ sentiment = ["vaderSentiment"]
25
+ translation = ["deep-translator"]
26
+ brotli = ["brotli>=1.0.0"]
27
+ distributed = ["redis>=4.5.0"]
28
+ http2 = ["httpx[http2]>=0.27.0"]
29
+ all = ["vaderSentiment", "deep-translator", "chardet", "charset_normalizer", "brotli>=1.0.0", "redis>=4.5.0", "httpx[http2]>=0.27.0"]
30
+
31
+ [tool.setuptools.packages.find]
32
+ where = ["src"]
jwebs-1.0.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
jwebs-1.0.0/src/jwebs/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright 2026 J Code
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from .core.http import FastHTTP, HTTPResponse, RequestRecord
4
+ from .core.exceptions import (
5
+ JWebsError, HTTPError, JWebsConnectionError,
6
+ JWebsTimeoutError, RobotsBlockedError, CacheError
7
+ )
8
+ from .check import Checker, SecurityReport, SEOScore, PerformanceMetrics
9
+ from .extract import Builder
10
+ from .crawl import Crawler, DistributedCrawler
11
+ from .ai import AIScrapingEngine, GraphQLClient, GraphQLResponse
12
+ from .captcha import CaptchaSolver, CAPTCHAResult
13
+ from .proxy import ProxyRotator, ProxyConfig
14
+ from .monitor import Monitor
15
+ from .smart import SmartScraper
16
+ from .async_ import AsyncClient, AsyncResponse
17
+ from .diff import ContentDiffer
18
+ from .generate import SitemapGenerator, RSSGenerator
19
+ from .jwebs import JWebs
20
+
21
+ __version__ = "1.0.0"
22
+ __author__ = "J Code"
23
+ __license__ = "Apache-2.0"