spiderforce4ai-0.1.6-py3-none-any.whl → spiderforce4ai-0.1.8-py3-none-any.whl

spiderforce4ai/__init__.py

@@ -86,6 +86,31 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+
+def _send_webhook_sync(result: CrawlResult, config: CrawlConfig) -> None:
+    """Synchronous version of webhook sender for parallel processing."""
+    if not config.webhook_url:
+        return
+
+    payload = {
+        "url": result.url,
+        "status": result.status,
+        "markdown": result.markdown if result.status == "success" else None,
+        "error": result.error if result.status == "failed" else None,
+        "timestamp": result.timestamp,
+        "config": config.to_dict()
+    }
+
+    try:
+        response = requests.post(
+            config.webhook_url,
+            json=payload,
+            timeout=config.webhook_timeout
+        )
+        response.raise_for_status()
+    except Exception as e:
+        print(f"Warning: Failed to send webhook for {result.url}: {str(e)}")
+
 # Module level function for multiprocessing
 def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
     """Process a single URL for parallel processing."""
@@ -99,12 +124,15 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
 
         response = requests.post(endpoint, json=payload, timeout=config.timeout)
         if response.status_code != 200:
-            return CrawlResult(
+            result = CrawlResult(
                 url=url,
                 status="failed",
                 error=f"HTTP {response.status_code}: {response.text}",
                 config=config.to_dict()
             )
+            # Send webhook for failed result
+            _send_webhook_sync(result, config)
+            return result
 
         markdown = response.text
 
@@ -114,24 +142,32 @@ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
             with open(filepath, 'w', encoding='utf-8') as f:
                 f.write(markdown)
 
-        # Add delay if configured
-        if config.request_delay:
-            time.sleep(config.request_delay)
-
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="success",
             markdown=markdown,
             config=config.to_dict()
         )
+
+        # Send webhook for successful result
+        _send_webhook_sync(result, config)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return result
 
     except Exception as e:
-        return CrawlResult(
+        result = CrawlResult(
             url=url,
             status="failed",
             error=str(e),
             config=config.to_dict()
         )
+        # Send webhook for error result
+        _send_webhook_sync(result, config)
+        return result
 
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
@@ -160,6 +196,113 @@ class SpiderForce4AI:
             await f.write(markdown)
         return filepath
 
+
+
+    def crawl_sitemap_server_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl sitemap URLs using server-side parallel processing.
+        """
+        print(f"Fetching sitemap from {sitemap_url}...")
+
+        # Fetch sitemap
+        try:
+            response = requests.get(sitemap_url, timeout=config.timeout)
+            response.raise_for_status()
+            sitemap_text = response.text
+        except Exception as e:
+            print(f"Error fetching sitemap: {str(e)}")
+            raise
+
+        # Parse sitemap
+        try:
+            root = ET.fromstring(sitemap_text)
+            namespace = {'ns': root.tag.split('}')[0].strip('{')}
+            urls = [loc.text for loc in root.findall('.//ns:loc', namespace)]
+            print(f"Found {len(urls)} URLs in sitemap")
+        except Exception as e:
+            print(f"Error parsing sitemap: {str(e)}")
+            raise
+
+        # Process URLs using server-side parallel endpoint
+        return self.crawl_urls_server_parallel(urls, config)
+
+
+    def crawl_urls_server_parallel(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
+        """
+        Crawl multiple URLs using server-side parallel processing.
+        This uses the /convert_parallel endpoint which handles parallelization on the server.
+        """
+        print(f"Sending {len(urls)} URLs for parallel processing...")
+
+        try:
+            endpoint = f"{self.base_url}/convert_parallel"
+
+            # Prepare payload
+            payload = {
+                "urls": urls,
+                **config.to_dict()
+            }
+
+            # Send request
+            response = requests.post(
+                endpoint,
+                json=payload,
+                timeout=config.timeout
+            )
+            response.raise_for_status()
+
+            # Process results
+            results = []
+            server_results = response.json()  # Assuming server returns JSON array of results
+
+            for url_result in server_results:
+                result = CrawlResult(
+                    url=url_result["url"],
+                    status=url_result.get("status", "failed"),
+                    markdown=url_result.get("markdown"),
+                    error=url_result.get("error"),
+                    config=config.to_dict()
+                )
+
+                # Save markdown if successful and output dir is configured
+                if result.status == "success" and config.output_dir and result.markdown:
+                    filepath = config.output_dir / f"{slugify(result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(result.markdown)
+
+                # Send webhook if configured
+                if config.webhook_url:
+                    _send_webhook_sync(result, config)
+
+                results.append(result)
+
+            # Save report if configured
+            if config.report_file:
+                self._save_report_sync(results, config)
+                print(f"\nReport saved to: {config.report_file}")
+
+            # Print summary
+            successful = len([r for r in results if r.status == "success"])
+            failed = len([r for r in results if r.status == "failed"])
+            print(f"\nParallel processing completed:")
+            print(f"✓ Successful: {successful}")
+            print(f"✗ Failed: {failed}")
+
+            return results
+
+        except Exception as e:
+            print(f"Error during parallel processing: {str(e)}")
+            # Create failed results for all URLs
+            return [
+                CrawlResult(
+                    url=url,
+                    status="failed",
+                    error=str(e),
+                    config=config.to_dict()
+                ) for url in urls
+            ]
+
+
 
     async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
         """Send webhook with crawl results."""
         if not config.webhook_url:
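As the inline comment in `crawl_urls_server_parallel` notes, the code assumes `/convert_parallel` returns a JSON array with one object per URL. An illustrative sketch of that assumed shape (the exact server contract is not shown in this diff); each object maps directly onto the `.get()` calls in the loop above:

```python
# Assumed /convert_parallel response consumed by crawl_urls_server_parallel
# (illustrative only; field names follow the .get() calls in the method above).
server_results = [
    {
        "url": "https://example.com/page1",
        "status": "success",
        "markdown": "# Page 1\n\nConverted content...",
        "error": None,
    },
    {
        "url": "https://example.com/page2",
        "status": "failed",
        "markdown": None,
        "error": "HTTP 404: Not Found",
    },
]
```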
@@ -424,4 +567,5 @@ class SpiderForce4AI:
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
+        self._executor.shutdown(wait=True)
+
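The only change to `__exit__` is a trailing newline; it remains the hook that shuts down the process pool behind client-side parallel crawling. A small usage sketch, assuming `__enter__` is defined symmetrically (it is not shown in this diff):

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

# The with-block ensures _executor.shutdown(wait=True) runs via __exit__,
# assuming __enter__ returns the spider instance (not shown in this diff).
with SpiderForce4AI("http://localhost:3004") as spider:
    results = spider.crawl_urls_parallel(
        ["https://example.com/page1", "https://example.com/page2"],
        CrawlConfig(),
    )
```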
spiderforce4ai-0.1.6.dist-info/METADATA → spiderforce4ai-0.1.8.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.6
+Version: 0.1.8
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
@@ -24,75 +24,73 @@ Dynamic: requires-python
 
 # SpiderForce4AI Python Wrapper
 
-A Python wrapper for SpiderForce4AI - a powerful HTML-to-Markdown conversion service. This package provides an easy-to-use interface for crawling websites and converting their content to clean Markdown format.
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
 
 ## Quick Start (Minimal Setup)
 
 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
 
-# Initialize with your SpiderForce4AI service URL
+# Initialize with your service URL
 spider = SpiderForce4AI("http://localhost:3004")
 
-# Use default configuration (will save in ./spiderforce_reports)
+# Create default config
 config = CrawlConfig()
 
 # Crawl a single URL
 result = spider.crawl_url("https://example.com", config)
 ```
 
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
 ## Crawling Methods
 
-### 1. Single URL Crawling
+### 1. Single URL
 
 ```python
-# Synchronous
+# Basic usage
 result = spider.crawl_url("https://example.com", config)
 
-# Asynchronous
+# Async version
 async def crawl():
     result = await spider.crawl_url_async("https://example.com", config)
 ```
 
-### 2. Multiple URLs Crawling
+### 2. Multiple URLs
 
 ```python
-# List of URLs
 urls = [
     "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
+    "https://example.com/page2"
 ]
 
-# Synchronous
-results = spider.crawl_urls(urls, config)
+# Client-side parallel (using multiprocessing)
+results = spider.crawl_urls_parallel(urls, config)
+
+# Server-side parallel (single request)
+results = spider.crawl_urls_server_parallel(urls, config)
 
-# Asynchronous
+# Async version
 async def crawl():
     results = await spider.crawl_urls_async(urls, config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
 ```
 
 ### 3. Sitemap Crawling
 
 ```python
-# Synchronous
-results = spider.crawl_sitemap("https://example.com/sitemap.xml", config)
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 
-# Asynchronous
+# Async version
 async def crawl():
     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-
-# Parallel (using multiprocessing)
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
 ```
 
 ## Configuration Options
@@ -100,9 +98,11 @@ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", confi
 All configuration options are optional with sensible defaults:
 
 ```python
+from pathlib import Path
+
 config = CrawlConfig(
     # Content Selection (all optional)
-    target_selector="article",  # Specific element to target
+    target_selector="article",  # Specific element to extract
     remove_selectors=[  # Elements to remove
         ".ads",
         "#popup",
@@ -112,21 +112,21 @@ config = CrawlConfig(
     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
 
     # Processing Settings
-    max_concurrent_requests=1,  # Default: 1 (parallel processing)
-    request_delay=0.5,  # Delay between requests in seconds
-    timeout=30,  # Request timeout in seconds
+    max_concurrent_requests=1,  # For client-side parallel processing
+    request_delay=0.5,  # Delay between requests (seconds)
+    timeout=30,  # Request timeout (seconds)
 
     # Output Settings
-    output_dir="custom_output",  # Default: "spiderforce_reports"
-    report_file="custom_report.json",  # Default: "crawl_report.json"
-    webhook_url="https://your-webhook.com",  # Optional webhook endpoint
-    webhook_timeout=10  # Webhook timeout in seconds
+    output_dir=Path("spiderforce_reports"),  # Default directory for files
+    webhook_url="https://your-webhook.com",  # Real-time notifications
+    webhook_timeout=10,  # Webhook timeout
+    report_file=Path("crawl_report.json")  # Final report location
 )
 ```
 
 ## Real-World Examples
 
-### 1. Basic Website Crawling
+### 1. Basic Blog Crawling
 
 ```python
 from spiderforce4ai import SpiderForce4AI, CrawlConfig
@@ -134,78 +134,77 @@ from pathlib import Path
 
 spider = SpiderForce4AI("http://localhost:3004")
 config = CrawlConfig(
+    target_selector="article.post-content",
     output_dir=Path("blog_content")
 )
 
-result = spider.crawl_url("https://example.com/blog", config)
-print(f"Content saved to: {result.url}.md")
+result = spider.crawl_url("https://example.com/blog-post", config)
 ```
 
-### 2. Advanced Parallel Sitemap Crawling
+### 2. Parallel Website Crawling
 
 ```python
 config = CrawlConfig(
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
     remove_selectors=[
         ".navigation",
         ".footer",
         ".ads",
        "#cookie-notice"
     ],
+    max_concurrent_requests=5,
+    output_dir=Path("website_content"),
     webhook_url="https://your-webhook.com/endpoint"
 )
 
-results = spider.crawl_sitemap_parallel(
-    "https://example.com/sitemap.xml",
-    config
-)
+# Using server-side parallel processing
+results = spider.crawl_urls_server_parallel([
+    "https://example.com/page1",
+    "https://example.com/page2",
+    "https://example.com/page3"
+], config)
 ```
 
-### 3. Async Crawling with Progress
+### 3. Full Sitemap Processing
 
 ```python
-import asyncio
-
-async def main():
-    config = CrawlConfig(
-        max_concurrent_requests=3,
-        request_delay=1.0
-    )
-
-    async with spider:
-        results = await spider.crawl_urls_async([
-            "https://example.com/1",
-            "https://example.com/2",
-            "https://example.com/3"
-        ], config)
-
-    return results
+config = CrawlConfig(
+    target_selector="main",
+    remove_selectors=[".sidebar", ".comments"],
+    output_dir=Path("site_content"),
+    report_file=Path("crawl_report.json")
+)
 
-results = asyncio.run(main())
+results = spider.crawl_sitemap_server_parallel(
+    "https://example.com/sitemap.xml",
+    config
+)
 ```
 
 ## Output Structure
 
-### 1. File Organization
+### 1. Directory Layout
 ```
-output_dir/
-├── example-com-page1.md
+spiderforce_reports/        # Default output directory
+├── example-com-page1.md    # Converted markdown files
 ├── example-com-page2.md
-└── crawl_report.json
+└── crawl_report.json       # Crawl report
 ```
 
 ### 2. Markdown Files
-Each markdown file is named using a slugified version of the URL and contains the converted content.
+Each file is named using a slugified version of the URL:
+```markdown
+# Page Title
+
+Content converted to clean markdown...
+```
 
-### 3. Report JSON Structure
+### 3. Crawl Report
 ```json
 {
   "timestamp": "2025-02-15T10:30:00.123456",
   "config": {
     "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"],
-    "remove_selectors_regex": ["modal-\\d+"]
+    "remove_selectors": [".ads", "#popup"]
   },
   "results": {
     "successful": [
@@ -234,7 +233,7 @@ Each markdown file is named using a slugified version of the URL and contains th
 ```
 
 ### 4. Webhook Notifications
-If configured, webhooks receive real-time updates in JSON format:
+If configured, real-time updates are sent for each processed URL:
 ```json
 {
   "url": "https://example.com/page1",
@@ -250,7 +249,7 @@
 
 ## Error Handling
 
-The package handles various types of errors:
+The package handles various types of errors gracefully:
 - Network errors
 - Timeout errors
 - Invalid URLs
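Because failures come back as `CrawlResult` objects with `status="failed"` rather than raised exceptions (see the `_process_url_parallel` and `crawl_urls_server_parallel` changes above), callers can inspect errors after a batch finishes. A brief sketch, assuming the 0.1.8 API shown in this diff:

```python
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")
config = CrawlConfig()

results = spider.crawl_urls_server_parallel(
    ["https://example.com/page1", "https://example.com/broken-page"], config
)

# Failed URLs are reported, not raised; log or retry them as needed.
for r in results:
    if r.status == "failed":
        print(f"{r.url} failed: {r.error}")
```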
@@ -269,6 +268,25 @@ All errors are:
 - Running SpiderForce4AI service
 - Internet connection
 
+## Performance Considerations
+
+1. Server-side Parallel Processing
+   - Best for most cases
+   - Single HTTP request for multiple URLs
+   - Less network overhead
+   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
+
+2. Client-side Parallel Processing
+   - Good for special cases requiring local control
+   - Uses Python multiprocessing
+   - More network overhead
+   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
+
+3. Async Processing
+   - Best for integration with async applications
+   - Good for real-time processing
+   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
+
 ## License
 
 MIT License
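The three modes in the new Performance Considerations section above map directly onto methods shown earlier in this diff. A hedged sketch of choosing between the two parallel modes by batch size (the helper and its threshold are illustrative, not a recommendation from the package):

```python
# Illustrative mode selection (hypothetical helper; threshold chosen arbitrarily).
from typing import List

from spiderforce4ai import SpiderForce4AI, CrawlConfig, CrawlResult


def crawl_batch(spider: SpiderForce4AI, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
    if len(urls) > 10:
        # Large batches: one HTTP request, parallelized on the server.
        return spider.crawl_urls_server_parallel(urls, config)
    # Small batches: client-side multiprocessing overhead is negligible.
    return spider.crawl_urls_parallel(urls, config)


results = crawl_batch(
    SpiderForce4AI("http://localhost:3004"),
    ["https://example.com/page1", "https://example.com/page2"],
    CrawlConfig(),
)
```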
spiderforce4ai-0.1.8.dist-info/RECORD (new file)

@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=Y_7CfRVYQ2ssH67YexwCV12J14tB125U7WIhVTQfYwU,21652
+spiderforce4ai-0.1.8.dist-info/METADATA,sha256=kXn_kUTsFZm8wtdMt0lTo85Jr3SYAZQzZn_3VL4KkeU,7169
+spiderforce4ai-0.1.8.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.8.dist-info/RECORD,,
spiderforce4ai-0.1.6.dist-info/RECORD (deleted file)

@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
-spiderforce4ai-0.1.6.dist-info/METADATA,sha256=7rcL1OGqYeF1QHWUIB9xHaKYxGGegs2zHNz0UTu-ego,6575
-spiderforce4ai-0.1.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.6.dist-info/RECORD,,