spiderforce4ai-1.0-py3-none-any.whl → spiderforce4ai-1.2-py3-none-any.whl

@@ -445,7 +445,11 @@ class SpiderForce4AI:
         if not failed_results:
             return []
 
-        console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+        failed_count = len(failed_results)
+        total_count = len([r for r in self.crawl_results])
+        failure_ratio = (failed_count / total_count) * 100
+
+        console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
         retry_results = []
 
         # Create a new progress bar if one wasn't provided
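
The hunk above changes the retry banner from a fixed message to one that reports how many URLs failed and what share of the crawl that represents. A minimal standalone sketch of that calculation (the `Result` class and its `status` field below are illustrative assumptions, not the package's actual types):

```python
# Sketch only: how the failure-ratio banner in the hunk above is derived.
# The Result dataclass is an illustrative stand-in for the package's results.
from dataclasses import dataclass

@dataclass
class Result:
    url: str
    status: str  # "success" or "failed"

def retry_banner(all_results: list[Result]) -> str:
    failed = [r for r in all_results if r.status == "failed"]
    if not failed:
        return "Nothing to retry"
    failure_ratio = len(failed) / len(all_results) * 100
    return f"Retrying failed URLs: {len(failed)} ({failure_ratio:.1f}% failed)"

print(retry_banner([Result("/a", "failed"), Result("/b", "success")]))
# Retrying failed URLs: 1 (50.0% failed)
```
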
@@ -531,8 +535,13 @@ class SpiderForce4AI:
                 results = initial_results
             else:
                 retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                # Replace failed results with retry results
-                results = [r for r in initial_results if r.status == "success"] + retry_results
+                # Update results list by replacing failed results with successful retries
+                results = initial_results.copy()
+                for retry_result in retry_results:
+                    for i, result in enumerate(results):
+                        if result.url == retry_result.url:
+                            results[i] = retry_result
+                            break
         else:
             results = initial_results
 
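This hunk swaps the old merge strategy (keep only the initially successful results and append the retries) for an in-place, URL-keyed replacement, so the result list keeps its original order and still carries an entry for URLs that failed both times. A rough standalone sketch of that merge, using plain dicts as a stand-in for the package's result objects:

```python
# Sketch of the URL-keyed merge introduced above; dicts are an illustrative
# stand-in for the package's own result objects.
def merge_retries(initial: list[dict], retries: list[dict]) -> list[dict]:
    merged = initial.copy()
    for retry in retries:
        for i, original in enumerate(merged):
            if original["url"] == retry["url"]:
                merged[i] = retry  # overwrite the failed attempt in place
                break
    return merged

initial = [{"url": "/a", "status": "success"},
           {"url": "/b", "status": "failed"}]
retries = [{"url": "/b", "status": "success"}]
assert [r["status"] for r in merge_retries(initial, retries)] == ["success", "success"]
```
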
@@ -652,15 +661,32 @@ class SpiderForce4AI:
         if failure_ratio > 20:
             console.print(f"\n[red]Failure ratio too high ({failure_ratio:.1f}%) - aborting retry due to possible server overload[/red]")
         else:
-            console.print("\n[yellow]Retrying failed URLs...[/yellow]")
+            failed_count = len(failed_results)
+            failure_ratio = (failed_count / total_urls) * 100
+            console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
             for result in failed_results:
                 new_result = _process_url_parallel((result.url, self.base_url, config))
+
+                # Save markdown and trigger webhook for successful retries
                 if new_result.status == "success":
                     console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                    # Replace the failed result with the successful retry
-                    results[results.index(result)] = new_result
+                    # Save markdown if output directory is configured
+                    if config.output_dir and new_result.markdown:
+                        filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                        with open(filepath, 'w', encoding='utf-8') as f:
+                            f.write(new_result.markdown)
+                    # Send webhook for successful retry
+                    _send_webhook_sync(new_result, config)
                 else:
                     console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                    # Send webhook for failed retry
+                    _send_webhook_sync(new_result, config)
+
+                # Update results list
+                for i, r in enumerate(results):
+                    if r.url == new_result.url:
+                        results[i] = new_result
+                        break
 
         # Calculate final statistics
         final_successful = len([r for r in results if r.status == "success"])
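
Beyond the updated banner, this hunk makes retried URLs go through the same post-processing as first-pass results: successful retries are written to the output directory and both outcomes trigger `_send_webhook_sync`. The helper below is a hedged sketch of the save step only; `slugify_url` and the resulting filenames are stand-ins, not the package's own `slugify`, so exact names may differ from what the library produces.

```python
# Illustrative sketch of persisting a retried page as Markdown; the slugifier
# is a stand-in and may not match the package's filename scheme exactly.
import re
from pathlib import Path

def slugify_url(url: str) -> str:
    return re.sub(r"[^a-zA-Z0-9]+", "-", url).strip("-").lower()

def save_markdown(output_dir: Path, url: str, markdown: str) -> Path:
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / f"{slugify_url(url)}.md"
    filepath.write_text(markdown, encoding="utf-8")
    return filepath

print(save_markdown(Path("content"), "https://example.com/page1", "# Page 1\n"))
```
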
@@ -0,0 +1,298 @@
+Metadata-Version: 2.2
+Name: spiderforce4ai
+Version: 1.2
+Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+Home-page: https://petertam.pro
+Author: Piotr Tamulewicz
+Author-email: Piotr Tamulewicz <pt@petertam.pro>
+License: MIT
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio>=3.4.3
+Requires-Dist: rich>=10.0.0
+Requires-Dist: aiofiles>=0.8.0
+Requires-Dist: httpx>=0.24.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: requires-python
+
+# SpiderForce4AI Python Wrapper
+
+A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
+
+## Features
+
+- HTML to Markdown conversion
+- Parallel and async crawling support
+- Sitemap processing
+- Custom content selection
+- Automatic retry mechanism
+- Detailed progress tracking
+- Webhook notifications
+- Customizable reporting
+
+## Installation
+
+```bash
+pip install spiderforce4ai
+```
+
+## Quick Start
+
+```python
+from spiderforce4ai import SpiderForce4AI, CrawlConfig
+from pathlib import Path
+
+# Initialize crawler
+spider = SpiderForce4AI("http://localhost:3004")
+
+# Configure crawling options
+config = CrawlConfig(
+    target_selector="article",
+    remove_selectors=[".ads", ".navigation"],
+    max_concurrent_requests=5,
+    save_reports=True
+)
+
+# Crawl a sitemap
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+```
+
+## Key Features
+
+### 1. Smart Retry Mechanism
+- Automatically retries failed URLs
+- Monitors failure ratio to prevent server overload
+- Detailed retry statistics and progress tracking
+- Aborts retries if failure rate exceeds 20%
+
+```python
+# Retry behavior is automatic
+config = CrawlConfig(
+    max_concurrent_requests=5,
+    request_delay=1.0  # Delay between retries
+)
+results = spider.crawl_urls_async(urls, config)
+```
+
+### 2. Custom Webhook Integration
+- Flexible payload formatting
+- Custom headers support
+- Variable substitution in templates
+
+```python
+config = CrawlConfig(
+    webhook_url="https://your-webhook.com",
+    webhook_headers={
+        "Authorization": "Bearer token",
+        "X-Custom-Header": "value"
+    },
+    webhook_payload_template='''{
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "custom_field": "value"
+    }'''
+)
+```
+
+### 3. Flexible Report Generation
+- Optional report saving
+- Customizable report location
+- Detailed success/failure statistics
+
+```python
+config = CrawlConfig(
+    save_reports=True,
+    report_file=Path("custom_report.json"),
+    output_dir=Path("content")
+)
+```
+
+## Crawling Methods
+
+### 1. Single URL Processing
+
+```python
+# Synchronous
+result = spider.crawl_url("https://example.com", config)
+
+# Asynchronous
+async def crawl():
+    result = await spider.crawl_url_async("https://example.com", config)
+```
+
+### 2. Multiple URLs
+
+```python
+urls = ["https://example.com/page1", "https://example.com/page2"]
+
+# Server-side parallel (recommended)
+results = spider.crawl_urls_server_parallel(urls, config)
+
+# Client-side parallel
+results = spider.crawl_urls_parallel(urls, config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_urls_async(urls, config)
+```
+
+### 3. Sitemap Processing
+
+```python
+# Server-side parallel (recommended)
+results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+# Client-side parallel
+results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+
+# Asynchronous
+async def crawl():
+    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+```
+
+## Configuration Options
+
+```python
+config = CrawlConfig(
+    # Content Selection
+    target_selector="article",              # Target element to extract
+    remove_selectors=[".ads", "#popup"],    # Elements to remove
+    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+    # Processing
+    max_concurrent_requests=5,              # Parallel processing limit
+    request_delay=0.5,                      # Delay between requests
+    timeout=30,                             # Request timeout
+
+    # Output
+    output_dir=Path("content"),             # Output directory
+    save_reports=False,                     # Enable/disable report saving
+    report_file=Path("report.json"),        # Report location
+
+    # Webhook
+    webhook_url="https://webhook.com",      # Webhook endpoint
+    webhook_timeout=10,                     # Webhook timeout
+    webhook_headers={                       # Custom headers
+        "Authorization": "Bearer token"
+    },
+    webhook_payload_template='''            # Custom payload format
+    {
+        "url": "{url}",
+        "content": "{markdown}",
+        "status": "{status}",
+        "error": "{error}",
+        "time": "{timestamp}"
+    }'''
+)
+```
+
+## Progress Tracking
+
+The package provides detailed progress information:
+
+```
+Fetching sitemap from https://example.com/sitemap.xml...
+Found 156 URLs in sitemap
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs
+
+Retrying failed URLs: 18 (11.5% failed)
+[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries
+
+Crawling Summary:
+Total URLs processed: 156
+Initial failures: 18 (11.5%)
+Final results:
+  ✓ Successful: 150
+  ✗ Failed: 6
+Retry success rate: 12/18 (66.7%)
+```
+
+## Output Structure
+
+### 1. Directory Layout
+```
+content/                    # Output directory
+├── example-com-page1.md    # Markdown files
+├── example-com-page2.md
+└── report.json             # Crawl report
+```
+
+### 2. Report Format
+```json
+{
+    "timestamp": "2025-02-15T10:30:00",
+    "config": {
+        "target_selector": "article",
+        "remove_selectors": [".ads"]
+    },
+    "results": {
+        "successful": [...],
+        "failed": [...]
+    },
+    "summary": {
+        "total": 156,
+        "successful": 150,
+        "failed": 6
+    }
+}
+```
+
+## Performance Optimization
+
+1. Server-side Parallel Processing
+   - Recommended for most cases
+   - Single HTTP request
+   - Reduced network overhead
+   - Built-in load balancing
+
+2. Client-side Parallel Processing
+   - Better control over processing
+   - Customizable concurrency
+   - Progress tracking per URL
+   - Automatic retry handling
+
+3. Asynchronous Processing
+   - Ideal for async applications
+   - Non-blocking operation
+   - Real-time progress updates
+   - Efficient resource usage
+
+## Error Handling
+
+The package provides comprehensive error handling:
+
+- Automatic retry for failed URLs
+- Failure ratio monitoring
+- Detailed error reporting
+- Webhook error notifications
+- Progress tracking during retries
+
+## Requirements
+
+- Python 3.11+
+- Running SpiderForce4AI service
+- Internet connection
+
+## Dependencies
+
+- aiohttp
+- asyncio
+- rich
+- aiofiles
+- httpx
+
+## License
+
+MIT License
+
+## Credits
+
+Created by [Peter Tam](https://petertam.pro)
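
The metadata above documents a `webhook_payload_template` with `{url}`, `{markdown}`, `{status}`, `{error}`, and `{timestamp}` placeholders. The package's own substitution code is not shown in this diff; the sketch below is only one plausible way such a template could be filled while keeping the payload valid JSON (JSON-escaping each value before insertion):

```python
# Hypothetical template renderer, not the package's implementation: each
# placeholder is replaced with a JSON-escaped value so the payload stays valid.
import json

def render_payload(template: str, **fields: str) -> str:
    rendered = template
    for key, value in fields.items():
        rendered = rendered.replace("{" + key + "}", json.dumps(value)[1:-1])
    return rendered

template = '{"url": "{url}", "status": "{status}", "content": "{markdown}"}'
payload = render_payload(template, url="https://example.com/page1",
                         status="success", markdown="# Title\nBody")
assert json.loads(payload) == {"url": "https://example.com/page1",
                               "status": "success",
                               "content": "# Title\nBody"}
```
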
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=BHsdGGxEyS4RHbHyTnYRBE4oRy2i1pGSrEt_LT4vKWc,29384
+spiderforce4ai-1.2.dist-info/METADATA,sha256=I5gmglzuRXSKwRc0lWk2Vslnx_4PIffIwjJ-SOTeYpU,7183
+spiderforce4ai-1.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-1.2.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-1.2.dist-info/RECORD,,
@@ -1,309 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 1.0
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-License: MIT
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper
-
-A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
-
-## Quick Start (Minimal Setup)
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-
-# Initialize with your service URL
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Create default config
-config = CrawlConfig()
-
-# Crawl a single URL
-result = spider.crawl_url("https://example.com", config)
-```
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Crawling Methods
-
-### 1. Single URL
-
-```python
-# Basic usage
-result = spider.crawl_url("https://example.com", config)
-
-# Async version
-async def crawl():
-    result = await spider.crawl_url_async("https://example.com", config)
-```
-
-### 2. Multiple URLs
-
-```python
-urls = [
-    "https://example.com/page1",
-    "https://example.com/page2"
-]
-
-# Client-side parallel (using multiprocessing)
-results = spider.crawl_urls_parallel(urls, config)
-
-# Server-side parallel (single request)
-results = spider.crawl_urls_server_parallel(urls, config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_urls_async(urls, config)
-```
-
-### 3. Sitemap Crawling
-
-```python
-# Server-side parallel (recommended)
-results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
-
-# Client-side parallel
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
-
-# Async version
-async def crawl():
-    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-```
-
-## Configuration Options
-
-All configuration options are optional with sensible defaults:
-
-```python
-from pathlib import Path
-
-config = CrawlConfig(
-    # Content Selection (all optional)
-    target_selector="article",              # Specific element to extract
-    remove_selectors=[                      # Elements to remove
-        ".ads",
-        "#popup",
-        ".navigation",
-        ".footer"
-    ],
-    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-    # Processing Settings
-    max_concurrent_requests=1,              # For client-side parallel processing
-    request_delay=0.5,                      # Delay between requests (seconds)
-    timeout=30,                             # Request timeout (seconds)
-
-    # Output Settings
-    output_dir=Path("spiderforce_reports"), # Default directory for files
-    webhook_url="https://your-webhook.com", # Real-time notifications
-    webhook_timeout=10,                     # Webhook timeout
-    webhook_headers={                       # Optional custom headers for webhook
-        "Authorization": "Bearer your-token",
-        "X-Custom-Header": "value"
-    },
-    webhook_payload_template='''{           # Optional custom webhook payload template
-        "crawled_url": "{url}",
-        "content": "{markdown}",
-        "crawl_status": "{status}",
-        "crawl_error": "{error}",
-        "crawl_time": "{timestamp}",
-        "custom_field": "your-value"
-    }''',
-    save_reports=False,                     # Whether to save crawl reports (default: False)
-    report_file=Path("crawl_report.json")   # Report location (used only if save_reports=True)
-)
-```
-
-## Real-World Examples
-
-### 1. Basic Blog Crawling
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-from pathlib import Path
-
-spider = SpiderForce4AI("http://localhost:3004")
-config = CrawlConfig(
-    target_selector="article.post-content",
-    output_dir=Path("blog_content")
-)
-
-result = spider.crawl_url("https://example.com/blog-post", config)
-```
-
-### 2. Parallel Website Crawling
-
-```python
-config = CrawlConfig(
-    remove_selectors=[
-        ".navigation",
-        ".footer",
-        ".ads",
-        "#cookie-notice"
-    ],
-    max_concurrent_requests=5,
-    output_dir=Path("website_content"),
-    webhook_url="https://your-webhook.com/endpoint"
-)
-
-# Using server-side parallel processing
-results = spider.crawl_urls_server_parallel([
-    "https://example.com/page1",
-    "https://example.com/page2",
-    "https://example.com/page3"
-], config)
-```
-
-### 3. Full Sitemap Processing
-
-```python
-config = CrawlConfig(
-    target_selector="main",
-    remove_selectors=[".sidebar", ".comments"],
-    output_dir=Path("site_content"),
-    report_file=Path("crawl_report.json")
-)
-
-results = spider.crawl_sitemap_server_parallel(
-    "https://example.com/sitemap.xml",
-    config
-)
-```
-
-## Output Structure
-
-### 1. Directory Layout
-```
-spiderforce_reports/          # Default output directory
-├── example-com-page1.md      # Converted markdown files
-├── example-com-page2.md
-└── crawl_report.json         # Crawl report
-```
-
-### 2. Markdown Files
-Each file is named using a slugified version of the URL:
-```markdown
-# Page Title
-
-Content converted to clean markdown...
-```
-
-### 3. Crawl Report
-```json
-{
-    "timestamp": "2025-02-15T10:30:00.123456",
-    "config": {
-        "target_selector": "article",
-        "remove_selectors": [".ads", "#popup"]
-    },
-    "results": {
-        "successful": [
-            {
-                "url": "https://example.com/page1",
-                "status": "success",
-                "markdown": "# Page Title\n\nContent...",
-                "timestamp": "2025-02-15T10:30:00.123456"
-            }
-        ],
-        "failed": [
-            {
-                "url": "https://example.com/page2",
-                "status": "failed",
-                "error": "HTTP 404: Not Found",
-                "timestamp": "2025-02-15T10:30:01.123456"
-            }
-        ]
-    },
-    "summary": {
-        "total": 2,
-        "successful": 1,
-        "failed": 1
-    }
-}
-```
-
-### 4. Webhook Notifications
-If configured, real-time updates are sent for each processed URL:
-```json
-{
-    "url": "https://example.com/page1",
-    "status": "success",
-    "markdown": "# Page Title\n\nContent...",
-    "timestamp": "2025-02-15T10:30:00.123456",
-    "config": {
-        "target_selector": "article",
-        "remove_selectors": [".ads", "#popup"]
-    }
-}
-```
-
-## Error Handling
-
-The package handles various types of errors gracefully:
-- Network errors
-- Timeout errors
-- Invalid URLs
-- Missing content
-- Service errors
-
-All errors are:
-1. Logged in the console
-2. Included in the JSON report
-3. Sent via webhook (if configured)
-4. Available in the results list
-
-## Requirements
-
-- Python 3.11 or later
-- Running SpiderForce4AI service
-- Internet connection
-
-## Performance Considerations
-
-1. Server-side Parallel Processing
-   - Best for most cases
-   - Single HTTP request for multiple URLs
-   - Less network overhead
-   - Use: `crawl_urls_server_parallel()` or `crawl_sitemap_server_parallel()`
-
-2. Client-side Parallel Processing
-   - Good for special cases requiring local control
-   - Uses Python multiprocessing
-   - More network overhead
-   - Use: `crawl_urls_parallel()` or `crawl_sitemap_parallel()`
-
-3. Async Processing
-   - Best for integration with async applications
-   - Good for real-time processing
-   - Use: `crawl_url_async()`, `crawl_urls_async()`, or `crawl_sitemap_async()`
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=8WEcryB8fckf5yIvH55s7a5FtxvK_AhXdi_dyaqqing,27929
-spiderforce4ai-1.0.dist-info/METADATA,sha256=VqydJoQcHkzvIhYTPeH3j8ZSHK-lGbo1xmZwQZk6w2s,7769
-spiderforce4ai-1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-1.0.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-1.0.dist-info/RECORD,,