spiderforce4ai-2.6.6-py3-none-any.whl → spiderforce4ai-2.6.8-py3-none-any.whl
This diff compares the contents of publicly available package versions as published to their respective public registries. It is provided for informational purposes only.
- spiderforce4ai/__init__.py +89 -134
- spiderforce4ai-2.6.8.dist-info/METADATA +789 -0
- spiderforce4ai-2.6.8.dist-info/RECORD +7 -0
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/WHEEL +1 -1
- spiderforce4ai-2.6.6.dist-info/METADATA +0 -336
- spiderforce4ai-2.6.6.dist-info/RECORD +0 -7
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/entry_points.txt +0 -0
- {spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/top_level.txt +0 -0

spiderforce4ai-2.6.8.dist-info/RECORD

@@ -0,0 +1,7 @@
+spiderforce4ai/__init__.py,sha256=p_ybuwvTD7bTelORBzAkomUQrc69WvOmu3owHKlzp0A,42231
+spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
+spiderforce4ai-2.6.8.dist-info/METADATA,sha256=QXMvOkWgOgNb4HL3RKgyPMlsSrOeleQlT-9ma0FRzQs,25726
+spiderforce4ai-2.6.8.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+spiderforce4ai-2.6.8.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
+spiderforce4ai-2.6.8.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-2.6.8.dist-info/RECORD,,

spiderforce4ai-2.6.6.dist-info/METADATA

@@ -1,336 +0,0 @@
-Metadata-Version: 2.2
-Name: spiderforce4ai
-Version: 2.6.6
-Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service with LLM post-processing
-Home-page: https://petertam.pro
-Author: Piotr Tamulewicz
-Author-email: Piotr Tamulewicz <pt@petertam.pro>
-Project-URL: Homepage, https://petertam.pro
-Project-URL: Documentation, https://petertam.pro/docs/spiderforce4ai
-Project-URL: Repository, https://github.com/yourusername/spiderforce4ai
-Project-URL: Bug Tracker, https://github.com/yourusername/spiderforce4ai/issues
-Keywords: web-scraping,markdown,html-to-markdown,llm,ai,content-extraction,async,parallel-processing
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
-Classifier: Topic :: Text Processing :: Markup :: Markdown
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
-Requires-Python: >=3.11
-Description-Content-Type: text/markdown
-Requires-Dist: aiohttp>=3.8.0
-Requires-Dist: asyncio>=3.4.3
-Requires-Dist: rich>=10.0.0
-Requires-Dist: aiofiles>=0.8.0
-Requires-Dist: httpx>=0.24.0
-Requires-Dist: litellm>=1.26.0
-Requires-Dist: pydantic>=2.6.0
-Requires-Dist: requests>=2.31.0
-Requires-Dist: aiofiles>=23.2.1
-Requires-Dist: et-xmlfile>=1.1.0
-Requires-Dist: multidict>=6.0.4
-Requires-Dist: openai>=1.12.0
-Requires-Dist: pandas>=2.2.0
-Requires-Dist: numpy>=1.26.0
-Requires-Dist: yarl>=1.9.4
-Requires-Dist: typing_extensions>=4.9.0
-Provides-Extra: dev
-Requires-Dist: pytest>=7.4.0; extra == "dev"
-Requires-Dist: pytest-asyncio>=0.21.1; extra == "dev"
-Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
-Requires-Dist: black>=23.7.0; extra == "dev"
-Requires-Dist: isort>=5.12.0; extra == "dev"
-Requires-Dist: mypy>=1.4.1; extra == "dev"
-Requires-Dist: ruff>=0.1.8; extra == "dev"
-Requires-Dist: pre-commit>=3.5.0; extra == "dev"
-Provides-Extra: test
-Requires-Dist: pytest>=7.4.0; extra == "test"
-Requires-Dist: pytest-asyncio>=0.21.1; extra == "test"
-Requires-Dist: pytest-cov>=4.1.0; extra == "test"
-Requires-Dist: pytest-mock>=3.12.0; extra == "test"
-Requires-Dist: coverage>=7.4.0; extra == "test"
-Provides-Extra: docs
-Requires-Dist: sphinx>=7.1.0; extra == "docs"
-Requires-Dist: sphinx-rtd-theme>=1.3.0; extra == "docs"
-Requires-Dist: myst-parser>=2.0.0; extra == "docs"
-Dynamic: author
-Dynamic: home-page
-Dynamic: requires-python
-
-# SpiderForce4AI Python Wrapper
-
-A Python package for web content crawling and HTML-to-Markdown conversion. Built for seamless integration with SpiderForce4AI service.
-
-## Features
-
-- HTML to Markdown conversion
-- Parallel and async crawling support
-- Sitemap processing
-- Custom content selection
-- Automatic retry mechanism
-- Detailed progress tracking
-- Webhook notifications
-- Customizable reporting
-
-## Installation
-
-```bash
-pip install spiderforce4ai
-```
-
-## Quick Start
-
-```python
-from spiderforce4ai import SpiderForce4AI, CrawlConfig
-from pathlib import Path
-
-# Initialize crawler
-spider = SpiderForce4AI("http://localhost:3004")
-
-# Configure crawling options
-config = CrawlConfig(
-    target_selector="article",
-    remove_selectors=[".ads", ".navigation"],
-    max_concurrent_requests=5,
-    save_reports=True
-)
-
-# Crawl a sitemap
-results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
-```
-
-## Key Features
-
-### 1. Smart Retry Mechanism
-- Automatically retries failed URLs
-- Monitors failure ratio to prevent server overload
-- Detailed retry statistics and progress tracking
-- Aborts retries if failure rate exceeds 20%
-
-```python
-# Retry behavior is automatic
-config = CrawlConfig(
-    max_concurrent_requests=5,
-    request_delay=1.0  # Delay between retries
-)
-results = spider.crawl_urls_async(urls, config)
-```
-
-### 2. Custom Webhook Integration
-- Flexible payload formatting
-- Custom headers support
-- Variable substitution in templates
-
-```python
-config = CrawlConfig(
-    webhook_url="https://your-webhook.com",
-    webhook_headers={
-        "Authorization": "Bearer token",
-        "X-Custom-Header": "value"
-    },
-    webhook_payload_template='''{
-        "url": "{url}",
-        "content": "{markdown}",
-        "status": "{status}",
-        "custom_field": "value"
-    }'''
-)
-```
-
-### 3. Flexible Report Generation
-- Optional report saving
-- Customizable report location
-- Detailed success/failure statistics
-
-```python
-config = CrawlConfig(
-    save_reports=True,
-    report_file=Path("custom_report.json"),
-    output_dir=Path("content")
-)
-```
-
-## Crawling Methods
-
-### 1. Single URL Processing
-
-```python
-# Synchronous
-result = spider.crawl_url("https://example.com", config)
-
-# Asynchronous
-async def crawl():
-    result = await spider.crawl_url_async("https://example.com", config)
-```
-
-### 2. Multiple URLs
-
-```python
-urls = ["https://example.com/page1", "https://example.com/page2"]
-
-# Server-side parallel (recommended)
-results = spider.crawl_urls_server_parallel(urls, config)
-
-# Client-side parallel
-results = spider.crawl_urls_parallel(urls, config)
-
-# Asynchronous
-async def crawl():
-    results = await spider.crawl_urls_async(urls, config)
-```
-
-### 3. Sitemap Processing
-
-```python
-# Server-side parallel (recommended)
-results = spider.crawl_sitemap_server_parallel("https://example.com/sitemap.xml", config)
-
-# Client-side parallel
-results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
-
-# Asynchronous
-async def crawl():
-    results = await spider.crawl_sitemap_async("https://example.com/sitemap.xml", config)
-```
-
-## Configuration Options
-
-```python
-config = CrawlConfig(
-    # Content Selection
-    target_selector="article",              # Target element to extract
-    remove_selectors=[".ads", "#popup"],    # Elements to remove
-    remove_selectors_regex=["modal-\\d+"],  # Regex patterns for removal
-
-    # Processing
-    max_concurrent_requests=5,              # Parallel processing limit
-    request_delay=0.5,                      # Delay between requests
-    timeout=30,                             # Request timeout
-
-    # Output
-    output_dir=Path("content"),             # Output directory
-    save_reports=False,                     # Enable/disable report saving
-    report_file=Path("report.json"),        # Report location
-
-    # Webhook
-    webhook_url="https://webhook.com",      # Webhook endpoint
-    webhook_timeout=10,                     # Webhook timeout
-    webhook_headers={                       # Custom headers
-        "Authorization": "Bearer token"
-    },
-    webhook_payload_template='''            # Custom payload format
-    {
-        "url": "{url}",
-        "content": "{markdown}",
-        "status": "{status}",
-        "error": "{error}",
-        "time": "{timestamp}"
-    }'''
-)
-```
-
-## Progress Tracking
-
-The package provides detailed progress information:
-
-```
-Fetching sitemap from https://example.com/sitemap.xml...
-Found 156 URLs in sitemap
-[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 156/156 URLs
-
-Retrying failed URLs: 18 (11.5% failed)
-[━━━━━━━━━━━━━━━━━━━━━━━━━━━━] 100% • 18/18 retries
-
-Crawling Summary:
-Total URLs processed: 156
-Initial failures: 18 (11.5%)
-Final results:
-✓ Successful: 150
-✗ Failed: 6
-Retry success rate: 12/18 (66.7%)
-```
-
-## Output Structure
-
-### 1. Directory Layout
-```
-content/                    # Output directory
-├── example-com-page1.md    # Markdown files
-├── example-com-page2.md
-└── report.json             # Crawl report
-```
-
-### 2. Report Format
-```json
-{
-  "timestamp": "2025-02-15T10:30:00",
-  "config": {
-    "target_selector": "article",
-    "remove_selectors": [".ads"]
-  },
-  "results": {
-    "successful": [...],
-    "failed": [...]
-  },
-  "summary": {
-    "total": 156,
-    "successful": 150,
-    "failed": 6
-  }
-}
-```
-
-## Performance Optimization
-
-1. Server-side Parallel Processing
-   - Recommended for most cases
-   - Single HTTP request
-   - Reduced network overhead
-   - Built-in load balancing
-
-2. Client-side Parallel Processing
-   - Better control over processing
-   - Customizable concurrency
-   - Progress tracking per URL
-   - Automatic retry handling
-
-3. Asynchronous Processing
-   - Ideal for async applications
-   - Non-blocking operation
-   - Real-time progress updates
-   - Efficient resource usage
-
-## Error Handling
-
-The package provides comprehensive error handling:
-
-- Automatic retry for failed URLs
-- Failure ratio monitoring
-- Detailed error reporting
-- Webhook error notifications
-- Progress tracking during retries
-
-## Requirements
-
-- Python 3.11+
-- Running SpiderForce4AI service
-- Internet connection
-
-## Dependencies
-
-- aiohttp
-- asyncio
-- rich
-- aiofiles
-- httpx
-
-## License
-
-MIT License
-
-## Credits
-
-Created by [Peter Tam](https://petertam.pro)

spiderforce4ai-2.6.6.dist-info/RECORD

@@ -1,7 +0,0 @@
-spiderforce4ai/__init__.py,sha256=BlrRIrtpDUHjmDedqgXP1KbAAsAH9vwFPncUR5VGGyM,44804
-spiderforce4ai/post_extraction_agent.py,sha256=7N2VYCfsfIh-my-Sc0_lnhmsfb3nyIbDOpnI007M1DM,19075
-spiderforce4ai-2.6.6.dist-info/METADATA,sha256=eoFT4zgeNK3TkBEF5pKnf5IducFbm1quZnndCuXPf-c,9012
-spiderforce4ai-2.6.6.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-2.6.6.dist-info/entry_points.txt,sha256=ibARQxOlDiL1ho12zbDZt4Uq5RKSIk_qk159ZlZ46hc,59
-spiderforce4ai-2.6.6.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-2.6.6.dist-info/RECORD,,

{spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/entry_points.txt

File without changes

{spiderforce4ai-2.6.6.dist-info → spiderforce4ai-2.6.8.dist-info}/top_level.txt

File without changes