spiderforce4ai 1.1__py3-none-any.whl → 1.3__py3-none-any.whl

This diff compares publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
@@ -350,17 +350,23 @@ class SpiderForce4AI:
 
     def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
         """Save crawl report synchronously."""
+        # Separate successful and failed results
+        successful_results = [r for r in results if r.status == "success"]
+        failed_results = [r for r in results if r.status == "failed"]
+
+        # Create report with only final state
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
 
@@ -372,17 +378,22 @@ class SpiderForce4AI:
         if not config.report_file:
             return
 
+        # Separate successful and failed results
+        successful_results = [r for r in self.crawl_results if r.status == "success"]
+        failed_results = [r for r in self.crawl_results if r.status == "failed"]
+
         report = {
             "timestamp": datetime.now().isoformat(),
             "config": config.to_dict(),
             "results": {
-                "successful": [asdict(r) for r in self.crawl_results if r.status == "success"],
-                "failed": [asdict(r) for r in self.crawl_results if r.status == "failed"]
+                "successful": [asdict(r) for r in successful_results],
+                "failed": [asdict(r) for r in failed_results]  # Only truly failed URLs after retries
             },
             "summary": {
                 "total": len(self.crawl_results),
-                "successful": len([r for r in self.crawl_results if r.status == "success"]),
-                "failed": len([r for r in self.crawl_results if r.status == "failed"])
+                "successful": len(successful_results),
+                "failed": len(failed_results),
+                "retry_info": getattr(self, '_retry_stats', {})  # Include retry statistics if available
             }
         }
 
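Note on the two report hunks above: both builders read `self._retry_stats` through `getattr` with a `{}` default, so the report still assembles when no retry pass ever ran and the attribute was never set. A minimal standalone sketch of that idiom (the keys in the example dict are hypothetical; the diff does not show how `_retry_stats` is populated):

```python
class ReportSource:
    """Stand-in for the crawler object; illustration only."""
    pass


crawler = ReportSource()

# Attribute absent: getattr returns the default instead of raising AttributeError.
print(getattr(crawler, "_retry_stats", {}))  # -> {}

# Attribute present: the recorded statistics are returned unchanged.
crawler._retry_stats = {"retried": 18, "recovered": 12}  # hypothetical structure
print(getattr(crawler, "_retry_stats", {}))  # -> {'retried': 18, 'recovered': 12}
```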
@@ -535,8 +546,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                # Replace failed results with retry results
-                results = [r for r in initial_results if r.status == "success"] + retry_results
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results
 
@@ -661,12 +677,27 @@ class SpiderForce4AI:
         console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         for result in failed_results:
             new_result = _process_url_parallel((result.url, self.base_url, config))
+
+            # Save markdown and trigger webhook for successful retries
             if new_result.status == "success":
                 console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                # Replace the failed result with the successful retry
-                results[results.index(result)] = new_result
+                # Save markdown if output directory is configured
+                if config.output_dir and new_result.markdown:
+                    filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(new_result.markdown)
+                # Send webhook for successful retry
+                _send_webhook_sync(new_result, config)
             else:
                 console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                # Send webhook for failed retry
+                _send_webhook_sync(new_result, config)
+
+            # Update results list
+            for i, r in enumerate(results):
+                if r.url == new_result.url:
+                    results[i] = new_result
+                    break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
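Taken together, the last two hunks replace the old "keep only successes, then append retries" logic with an in-place, URL-keyed merge, so result order is preserved and a URL that fails again keeps a single failed entry. A simplified standalone sketch of that merge step, with a minimal stand-in for the real `CrawlResult` dataclass:

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Result:
    """Minimal stand-in for CrawlResult; illustration only."""
    url: str
    status: str
    error: Optional[str] = None


def merge_retries(results: List[Result], retry_results: List[Result]) -> List[Result]:
    """Replace each original entry with its retry outcome, matched by URL."""
    merged = results.copy()
    for retry in retry_results:
        for i, original in enumerate(merged):
            if original.url == retry.url:
                merged[i] = retry  # successful or not, the retry outcome wins
                break
    return merged


initial = [
    Result("https://example.com/a", "success"),
    Result("https://example.com/b", "failed", "HTTP 503"),
]
retried = [Result("https://example.com/b", "success")]
print(merge_retries(initial, retried))  # /b is now a success, order unchanged
```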
@@ -0,0 +1,298 @@
+Metadata-Version: 2.2
+Name: spiderforce4ai
+Version: 1.3
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Home-page: https://petertam.pro
+Author: Piotr Tamulewicz
+Author-email: Piotr Tamulewicz <pt@petertam.pro>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: rich>=10.0.0
+Requires-Dist: aiofiles>=0.8.0
+Requires-Dist: httpx>=0.24.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# SpiderForce4AI Python Wrapper
+
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
+
+## Features
+
+- HTML to Markdown conversion
+- Parallel and async crawling support
+- Sitemap processing
+- Custom content selection
+- Automatic retry mechanism
+- Detailed progress tracking
+- Webhook notifications
+- Customizable reporting
+
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
+## Quick Start
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+from pathlib import Path
+
+# Initialize crawler
+spider = SpiderForce4AI("http://localhost:3004")
+
+# Configure crawling options
+config = CrawlConfig(
+    target_selector="article",
+    remove_selectors=[".ads", ".navigation"],
+    max_concurrent_requests=5,
+    save_reports=True
+)
+
+# Crawl a sitemap
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+```
+
+## Key Features
+
+### 1. Smart Retry Mechanism
+- Automatically retries failed URLs
+- Monitors failure ratio to prevent server overload
+- Detailed retry statistics and progress tracking
+- Aborts retries if failure rate exceeds 20%
+
+```python
+# Retry behavior is automatic
+config = CrawlConfig(
+    max_concurrent_requests=5,
+    request_delay=1.0  # Delay between retries
+)
+results = spider.crawl_urls_async(urls, config)
+```
+
+### 2. Custom Webhook Integration
+- Flexible payload formatting
+- Custom headers support
+- Variable substitution in templates
+
+```python
+config = CrawlConfig(
+    webhook_url="https://your-webhook.com",
+    webhook_headers={
+        "Authorization": "Bearer token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "custom_field": "value"
+    }'''
+)
+```
+
+### 3. Flexible Report Generation
+- Optional report saving
+- Customizable report location
+- Detailed success/failure statistics
+
+```python
+config = CrawlConfig(
+    save_reports=True,
+    report_file=Path("custom_report.json"),
+    output_dir=Path("content")
+)
+```
+
+## Crawling Methods
+
+### 1. Single URL Processing
+
+```python
+# Synchronous
+result = spider.crawl_url("https://example.com", config)
+
+# Asynchronous
+async def crawl():
+    result = await spider.crawl_url_async("https://example.com", config)
+```
+
+### 2. Multiple URLs
+
+```python
+urls = ["https://example.com/page1", "https://example.com/page2"]
+
+# Server-side parallel (recommended)
+results = spider.crawl_urls_server_parallel(urls, config)
+
+# Client-side parallel
+results = spider.crawl_urls_parallel(urls, config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_urls_async(urls, config)
+```
+
+### 3. Sitemap Processing
+
+```python
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+```
+
+## Configuration Options
+
+```python
+config = CrawlConfig(
+    # Content Selection
+    target_selector="article",              # Target element to extract
+    remove_selectors=[".ads", "#popup"],    # Elements to remove
+    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+    # Processing
+    max_concurrent_requests=5,              # Parallel processing limit
+    request_delay=0.5,                      # Delay between requests
+    timeout=30,                             # Request timeout
+
+    # Output
+    output_dir=Path("content"),             # Output directory
+    save_reports=False,                     # Enable/disable report saving
+    report_file=Path("report.json"),        # Report location
+
+    # Webhook
+    webhook_url="https://webhook.com",      # Webhook endpoint
+    webhook_timeout=10,                     # Webhook timeout
+    webhook_headers={                       # Custom headers
+        "Authorization": "Bearer token"
+    },
+    webhook_payload_template='''  # Custom payload format
+    {
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "error": "{error}",
+        "time": "{timestamp}"
+    }'''
+)
+```
+
+## Progress Tracking
+
+The package provides detailed progress information:
+
+```
+Fetching sitemap from https://example.com/sitemap.xml...
+Found 156 URLs in sitemap
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs
+
+Retrying failed URLs: 18 (11.5% failed)
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries
+
+Crawling Summary:
+Total URLs processed: 156
+Initial failures: 18 (11.5%)
+Final results:
+✓ Successful: 150
+✗ Failed: 6
+Retry success rate: 12/18 (66.7%)
+```
+
+## Output Structure
+
+### 1. Directory Layout
+```
+content/                  # Output directory
+├── example-com-page1.md  # Markdown files
+├── example-com-page2.md
+└── report.json           # Crawl report
+```
+
+### 2. Report Format
+```json
+{
+  "timestamp": "2025-02-15T10:30:00",
+  "config": {
+    "target_selector": "article",
+    "remove_selectors": [".ads"]
+  },
+  "results": {
+    "successful": [...],
+    "failed": [...]
+  },
+  "summary": {
+    "total": 156,
+    "successful": 150,
+    "failed": 6
+  }
+}
+```
+
+## Performance Optimization
+
+1. Server-side Parallel Processing
+   - Recommended for most cases
+   - Single HTTP request
+   - Reduced network overhead
+   - Built-in load balancing
+
+2. Client-side Parallel Processing
+   - Better control over processing
+   - Customizable concurrency
+   - Progress tracking per URL
+   - Automatic retry handling
+
+3. Asynchronous Processing
+   - Ideal for async applications
+   - Non-blocking operation
+   - Real-time progress updates
+   - Efficient resource usage
+
+## Error Handling
+
+The package provides comprehensive error handling:
+
+- Automatic retry for failed URLs
+- Failure ratio monitoring
+- Detailed error reporting
+- Webhook error notifications
+- Progress tracking during retries
+
+## Requirements
+
+- Python 3.11+
+- Running SpiderForce4AI service
+- Internet connection
+
+## Dependencies
+
+- aiohttp
+- asyncio
+- rich
+- aiofiles
+- httpx
+
+## License
+
+MIT License
+
+## Credits
+
+Created by [Peter Tam](https://petertam.pro)
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=Fbgqu9uPg0wuWZgiVYNTv6CkkcOHgU_f5-uoXRKhgn4,29922
+spiderforce4ai-1.3.dist-info/METADATA,sha256=ybuwcVE62JFnWJKcCdHDrOOqmbuh8PEzF69_yFK-eV0,7183
+spiderforce4ai-1.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.3.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.3.dist-info/RECORD,,
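For reference when reading the RECORD entries above and below: each row is `path,hash,size`, where the hash is the URL-safe base64 encoding of the file's SHA-256 digest with trailing `=` padding stripped. A small sketch of how such an entry can be recomputed locally (the path assumes an unpacked copy of the wheel):

```python
import base64
import hashlib
from pathlib import Path


def record_hash(path: Path) -> str:
    # URL-safe base64 of the SHA-256 digest, without '=' padding, as used in wheel RECORD files.
    digest = hashlib.sha256(path.read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


target = Path("spiderforce4ai/__init__.py")  # hypothetical local path
print(f"{target},{record_hash(target)},{target.stat().st_size}")
```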
@@ -1,309 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 1.1
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper
-
-A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
-
-## Quick Start (Minimal Setup)
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-# Initialize with your service URL
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Create default config
-config = CrawlConfig()
-
-# Crawl a single URL
-result = spider.crawl_url("https://example.com", config)
-```
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Crawling Methods
-
-### 1. Single URL
-
-```python
-# Basic usage
-result = spider.crawl_url("https://example.com", config)
-
-# Async version
-async def crawl():
-    result = await spider.crawl_url_async("https://example.com", config)
-```
-
-### 2. Multiple URLs
-
-```python
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2"
-]
-
-# Client-side parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
-
-# Server-side parallel (single request)
-results = spider.crawl_urls_server_parallel(urls, config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_urls_async(urls, config)
-```
-
-### 3. Sitemap Crawling
-
-```python
-# Server-side parallel (recommended)
-results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
-
-# Client-side parallel
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-```
-
-## Configuration Options
-
-All configuration options are optional with sensible defaults:
-
-```python
-from pathlib import Path
-
-config = CrawlConfig(
-    # Content Selection (all optional)
-    target_selector="article",          # Specific element to extract
-    remove_selectors=[                  # Elements to remove
-        ".ads",
-        "#popup",
-        ".navigation",
-        ".footer"
-    ],
-    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-    # Processing Settings
-    max_concurrent_requests=1,          # For client-side parallel processing
-    request_delay=0.5,                  # Delay between requests (seconds)
-    timeout=30,                         # Request timeout (seconds)
-
-    # Output Settings
-    output_dir=Path("spiderforce_reports"),  # Default directory for files
-    webhook_url="https://your-webhook.com",  # Real-time notifications
-    webhook_timeout=10,                 # Webhook timeout
-    webhook_headers={                   # Optional custom headers for webhook
-        "Authorization": "Bearer your-token",
-        "X-Custom-Header": "value"
-    },
-    webhook_payload_template='''{       # Optional custom webhook payload template
-        "crawled_url": "{url}",
-        "content": "{markdown}",
-        "crawl_status": "{status}",
-        "crawl_error": "{error}",
-        "crawl_time": "{timestamp}",
-        "custom_field": "your-value"
-    }''',
-    save_reports=False,                 # Whether to save crawl reports (default: False)
-    report_file=Path("crawl_report.json")  # Report location (used only if save_reports=True)
-)
-```
-
-## Real-World Examples
-
-### 1. Basic Blog Crawling
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-from pathlib import Path
-
-spider = SpiderForce4AI("http://localhost:3004")
-config = CrawlConfig(
-    target_selector="article.post-content",
-    output_dir=Path("blog_content")
-)
-
-result = spider.crawl_url("https://example.com/blog-post", config)
-```
-
-### 2. Parallel Website Crawling
-
-```python
-config = CrawlConfig(
-    remove_selectors=[
-        ".navigation",
-        ".footer",
-        ".ads",
-        "#cookie-notice"
-    ],
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
-    webhook_url="https://your-webhook.com/endpoint"
-)
-
-# Using server-side parallel processing
-results = spider.crawl_urls_server_parallel([
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-], config)
-```
-
-### 3. Full Sitemap Processing
-
-```python
-config = CrawlConfig(
-    target_selector="main",
-    remove_selectors=[".sidebar", ".comments"],
-    output_dir=Path("site_content"),
-    report_file=Path("crawl_report.json")
-)
-
-results = spider.crawl_sitemap_server_parallel(
-    "https://example.com/sitemap.xml",
-    config
-)
-```
-
-## Output Structure
-
-### 1. Directory Layout
-```
-spiderforce_reports/       # Default output directory
-├── example-com-page1.md   # Converted markdown files
-├── example-com-page2.md
-└── crawl_report.json      # Crawl report
-```
-
-### 2. Markdown Files
-Each file is named using a slugified version of the URL:
-```markdown
-# Page Title
-
-Content converted to clean markdown...
-```
-
-### 3. Crawl Report
-```json
-{
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-  },
-  "results": {
-    "successful": [
-      {
-        "url": "https://example.com/page1",
-        "status": "success",
-        "markdown": "# Page Title\n\nContent...",
-        "timestamp": "2025-02-15T10:30:00.123456"
-      }
-    ],
-    "failed": [
-      {
-        "url": "https://example.com/page2",
-        "status": "failed",
-        "error": "HTTP 404: Not Found",
-        "timestamp": "2025-02-15T10:30:01.123456"
-      }
-    ]
-  },
-  "summary": {
-    "total": 2,
-    "successful": 1,
-    "failed": 1
-  }
-}
-```
-
-### 4. Webhook Notifications
-If configured, real-time updates are sent for each processed URL:
-```json
-{
-  "url": "https://example.com/page1",
-  "status": "success",
-  "markdown": "# Page Title\n\nContent...",
-  "timestamp": "2025-02-15T10:30:00.123456",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads", "#popup"]
-  }
-}
-```
-
-## Error Handling
-
-The package handles various types of errors gracefully:
-- Network errors
-- Timeout errors
-- Invalid URLs
-- Missing content
-- Service errors
-
-All errors are:
-1. Logged in the console
-2. Included in the JSON report
-3. Sent via webhook (if configured)
-4. Available in the results list
-
-## Requirements
-
-- Python 3.11 or later
-- Running SpiderForce4AI service
-- Internet connection
-
-## Performance Considerations
-
-1. Server-side Parallel Processing
-   - Best for most cases
-   - Single HTTP request for multiple URLs
-   - Less network overhead
-   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
-
-2. Client-side Parallel Processing
-   - Good for special cases requiring local control
-   - Uses Python multiprocessing
-   - More network overhead
-   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
-
-3. Async Processing
-   - Best for integration with async applications
-   - Good for real-time processing
-   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=lCviRhfLngSMehFJZwyK4LirPwbWEyZ0RJjCt5FkBcY,28304
-spiderforce4ai-1.1.dist-info/METADATA,sha256=lQfqXn0ifJOmOmLkgr8YTSYUFiu6-HS3YsRD0togylo,7769
-spiderforce4ai-1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.1.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.1.dist-info/RECORD,,