spiderforce4ai 1.1.tar.gz → 1.2.tar.gz

@@ -0,0 +1,298 @@
+ Metadata-Version: 2.2
+ Name: spiderforce4ai
+ Version: 1.2
+ Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
+ Home-page: https://petertam.pro
+ Author: Piotr Tamulewicz
+ Author-email: Piotr Tamulewicz <pt@petertam.pro>
+ License: MIT
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Requires-Dist: aiohttp>=3.8.0
+ Requires-Dist: asyncio>=3.4.3
+ Requires-Dist: rich>=10.0.0
+ Requires-Dist: aiofiles>=0.8.0
+ Requires-Dist: httpx>=0.24.0
+ Dynamic: author
+ Dynamic: home-page
+ Dynamic: requires-python
+
+ # SpiderForce4AI Python Wrapper
+
+ A Python package for web content crawling and HTML-to-Markdown conversion, built for seamless integration with the SpiderForce4AI service.
+
+ ## Features
+
+ - HTML to Markdown conversion
+ - Parallel and async crawling support
+ - Sitemap processing
+ - Custom content selection
+ - Automatic retry mechanism
+ - Detailed progress tracking
+ - Webhook notifications
+ - Customizable reporting
+
+ ## Installation
+
+ ```bash
+ pip install spiderforce4ai
+ ```
+
+ ## Quick Start
+
+ ```python
+ from spiderforce4ai import SpiderForce4AI, CrawlConfig
+ from pathlib import Path
+
+ # Initialize crawler
+ spider = SpiderForce4AI("http://localhost:3004")
+
+ # Configure crawling options
+ config = CrawlConfig(
+     target_selector="article",
+     remove_selectors=[".ads", ".navigation"],
+     max_concurrent_requests=5,
+     save_reports=True
+ )
+
+ # Crawl a sitemap
+ results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+ ```
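+
+ Each result in the returned list exposes at least `url`, `status`, `markdown`, and `error` (attribute names inferred from the retry and report examples later in this README). A minimal, illustrative check of the outcome:
+
+ ```python
+ # Hypothetical post-processing of the results list (attribute names assumed)
+ ok = [r for r in results if r.status == "success"]
+ print(f"Crawled {len(ok)}/{len(results)} pages successfully")
+ ```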
+
+ ## Key Features
+
+ ### 1. Smart Retry Mechanism
+ - Automatically retries failed URLs
+ - Monitors failure ratio to prevent server overload
+ - Detailed retry statistics and progress tracking
+ - Aborts retries if failure rate exceeds 20%
+
+ ```python
+ # Retry behavior is automatic
+ config = CrawlConfig(
+     max_concurrent_requests=5,
+     request_delay=1.0  # Delay between retries
+ )
+
+ async def crawl():
+     results = await spider.crawl_urls_async(urls, config)
+ ```
+
+ ### 2. Custom Webhook Integration
+ - Flexible payload formatting
+ - Custom headers support
+ - Variable substitution in templates
+
+ ```python
+ config = CrawlConfig(
+     webhook_url="https://your-webhook.com",
+     webhook_headers={
+         "Authorization": "Bearer token",
+         "X-Custom-Header": "value"
+     },
+     webhook_payload_template='''{
+         "url": "{url}",
+         "content": "{markdown}",
+         "status": "{status}",
+         "custom_field": "value"
+     }'''
+ )
+ ```
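+
+ For illustration, assuming the `{url}`, `{markdown}`, and `{status}` placeholders are substituted verbatim, the delivered payload for a successfully crawled page might look like this (all values made up):
+
+ ```json
+ {
+     "url": "https://example.com/page1",
+     "content": "# Page 1\n\nConverted Markdown content...",
+     "status": "success",
+     "custom_field": "value"
+ }
+ ```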
+
+ ### 3. Flexible Report Generation
+ - Optional report saving
+ - Customizable report location
+ - Detailed success/failure statistics
+
+ ```python
+ config = CrawlConfig(
+     save_reports=True,
+     report_file=Path("custom_report.json"),
+     output_dir=Path("content")
+ )
+ ```
+
+ ## Crawling Methods
+
+ ### 1. Single URL Processing
+
+ ```python
+ # Synchronous
+ result = spider.crawl_url("https://example.com", config)
+
+ # Asynchronous
+ async def crawl():
+     result = await spider.crawl_url_async("https://example.com", config)
+ ```
+
+ ### 2. Multiple URLs
+
+ ```python
+ urls = ["https://example.com/page1", "https://example.com/page2"]
+
+ # Server-side parallel (recommended)
+ results = spider.crawl_urls_server_parallel(urls, config)
+
+ # Client-side parallel
+ results = spider.crawl_urls_parallel(urls, config)
+
+ # Asynchronous
+ async def crawl():
+     results = await spider.crawl_urls_async(urls, config)
+ ```
+
+ ### 3. Sitemap Processing
+
+ ```python
+ # Server-side parallel (recommended)
+ results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
+
+ # Client-side parallel
+ results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
+
+ # Asynchronous
+ async def crawl():
+     results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+ ```
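+
+ The `*_async` variants are coroutines, so they need to run inside an event loop. A minimal sketch using only the standard library (method names as in the examples above):
+
+ ```python
+ import asyncio
+
+ async def crawl():
+     return await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
+
+ results = asyncio.run(crawl())  # run the coroutine to completion
+ ```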
+
+ ## Configuration Options
+
+ ```python
+ config = CrawlConfig(
+     # Content Selection
+     target_selector="article",              # Target element to extract
+     remove_selectors=[".ads", "#popup"],    # Elements to remove
+     remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
+
+     # Processing
+     max_concurrent_requests=5,              # Parallel processing limit
+     request_delay=0.5,                      # Delay between requests
+     timeout=30,                             # Request timeout
+
+     # Output
+     output_dir=Path("content"),             # Output directory
+     save_reports=False,                     # Enable/disable report saving
+     report_file=Path("report.json"),        # Report location
+
+     # Webhook
+     webhook_url="https://webhook.com",      # Webhook endpoint
+     webhook_timeout=10,                     # Webhook timeout
+     webhook_headers={                       # Custom headers
+         "Authorization": "Bearer token"
+     },
+     # Custom payload format
+     webhook_payload_template='''
+     {
+         "url": "{url}",
+         "content": "{markdown}",
+         "status": "{status}",
+         "error": "{error}",
+         "time": "{timestamp}"
+     }'''
+ )
+ ```
+
+ ## Progress Tracking
+
+ The package provides detailed progress information:
+
+ ```
+ Fetching sitemap from https://example.com/sitemap.xml...
+ Found 156 URLs in sitemap
+ [━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs
+
+ Retrying failed URLs: 18 (11.5% failed)
+ [━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries
+
+ Crawling Summary:
+ Total URLs processed: 156
+ Initial failures: 18 (11.5%)
+ Final results:
+   ✓ Successful: 150
+   ✗ Failed: 6
+ Retry success rate: 12/18 (66.7%)
+ ```
+
+ ## Output Structure
+
+ ### 1. Directory Layout
+ ```
+ content/                    # Output directory
+ ├── example-com-page1.md    # Markdown files
+ ├── example-com-page2.md
+ └── report.json             # Crawl report
+ ```
+
+ ### 2. Report Format
+ ```json
+ {
+     "timestamp": "2025-02-15T10:30:00",
+     "config": {
+         "target_selector": "article",
+         "remove_selectors": [".ads"]
+     },
+     "results": {
+         "successful": [...],
+         "failed": [...]
+     },
+     "summary": {
+         "total": 156,
+         "successful": 150,
+         "failed": 6
+     }
+ }
+ ```
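+
+ Because the report is plain JSON, it is easy to post-process with the standard library. A minimal sketch that prints the summary and lists failed entries, assuming the directory layout shown above (the exact shape of each entry under `failed` is not documented here, so entries are printed as-is):
+
+ ```python
+ import json
+ from pathlib import Path
+
+ report = json.loads(Path("content/report.json").read_text(encoding="utf-8"))
+ summary = report["summary"]
+ print(f"{summary['successful']}/{summary['total']} URLs crawled successfully")
+ for entry in report["results"]["failed"]:
+     print("failed:", entry)  # entry structure assumed; inspect your own report
+ ```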
+
+ ## Performance Optimization
+
+ 1. Server-side Parallel Processing
+    - Recommended for most cases
+    - Single HTTP request
+    - Reduced network overhead
+    - Built-in load balancing
+
+ 2. Client-side Parallel Processing
+    - Better control over processing
+    - Customizable concurrency
+    - Progress tracking per URL
+    - Automatic retry handling
+
+ 3. Asynchronous Processing
+    - Ideal for async applications
+    - Non-blocking operation
+    - Real-time progress updates
+    - Efficient resource usage
+
+ ## Error Handling
+
+ The package provides comprehensive error handling; a minimal sketch of acting on failures follows the list below:
+
+ - Automatic retry for failed URLs
+ - Failure ratio monitoring
+ - Detailed error reporting
+ - Webhook error notifications
+ - Progress tracking during retries
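+
+ A minimal, illustrative way to act on failures after a crawl (attribute names and configuration values are assumptions based on the examples above):
+
+ ```python
+ failed = [r for r in results if r.status != "success"]
+ for r in failed:
+     print(f"{r.url} failed: {r.error}")
+
+ # Optionally re-crawl only the failures with a gentler configuration (illustrative values)
+ if failed:
+     retry_config = CrawlConfig(max_concurrent_requests=2, request_delay=2.0)
+     results = spider.crawl_urls_server_parallel([r.url for r in failed], retry_config)
+ ```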
+
+ ## Requirements
+
+ - Python 3.11+
+ - A running SpiderForce4AI service
+ - Internet connection
+
+ ## Dependencies
+
+ - aiohttp
+ - asyncio
+ - rich
+ - aiofiles
+ - httpx
+
+ ## License
+
+ MIT License
+
+ ## Credits
+
+ Created by [Peter Tam](https://petertam.pro)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "spiderforce4ai"
- version = "1.1"
+ version = "1.2"
  description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
  readme = "README.md"
  authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
  
  setup(
      name="spiderforce4ai",
-     version="1.1",
+     version="1.2",
      author="Piotr Tamulewicz",
      author_email="pt@petertam.pro",
      description="Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service",
@@ -535,8 +535,13 @@ class SpiderForce4AI:
                  results = initial_results
              else:
                  retry_results = await self._retry_failed_urls(failed_results, config, progress)
-                 # Replace failed results with retry results
-                 results = [r for r in initial_results if r.status == "success"] + retry_results
+                 # Update results list by replacing failed results with successful retries
+                 results = initial_results.copy()
+                 for retry_result in retry_results:
+                     for i, result in enumerate(results):
+                         if result.url == retry_result.url:
+                             results[i] = retry_result
+                             break
          else:
              results = initial_results
  
@@ -661,12 +666,27 @@ class SpiderForce4AI:
          console.print(f"\n[yellow]Retrying failed URLs: {failed_count} ({failure_ratio:.1f}% failed)[/yellow]")
          for result in failed_results:
              new_result = _process_url_parallel((result.url, self.base_url, config))
+ 
+             # Save markdown and trigger webhook for successful retries
              if new_result.status == "success":
                  console.print(f"[green]✓ Retry successful: {result.url}[/green]")
-                 # Replace the failed result with the successful retry
-                 results[results.index(result)] = new_result
+                 # Save markdown if output directory is configured
+                 if config.output_dir and new_result.markdown:
+                     filepath = config.output_dir / f"{slugify(new_result.url)}.md"
+                     with open(filepath, 'w', encoding='utf-8') as f:
+                         f.write(new_result.markdown)
+                 # Send webhook for successful retry
+                 _send_webhook_sync(new_result, config)
              else:
                  console.print(f"[red]✗ Retry failed: {result.url} - {new_result.error}[/red]")
+                 # Send webhook for failed retry
+                 _send_webhook_sync(new_result, config)
+ 
+             # Update results list
+             for i, r in enumerate(results):
+                 if r.url == new_result.url:
+                     results[i] = new_result
+                     break
  
          # Calculate final statistics
          final_successful = len([r for r in results if r.status == "success"])