spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares two publicly available versions of the package as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- spiderforce4ai/__init__.py +89 -87
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/METADATA +1 -1
- spiderforce4ai-0.1.5.dist-info/RECORD +5 -0
- spiderforce4ai-0.1.4.dist-info/RECORD +0 -5
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/WHEEL +0 -0
- {spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/top_level.txt +0 -0
spiderforce4ai/__init__.py
CHANGED
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
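The new `_process_url_parallel` helper sits at module level rather than on the class because `multiprocessing.Pool` pickles the callable and its arguments before sending them to worker processes; a lambda, a nested function, or a method bound to an instance that holds open sessions would not pickle cleanly. A minimal, self-contained sketch of the same pattern, independent of this package (the worker, base URL, and tuple layout below are illustrative only):

```python
# Illustrative sketch only, not code from spiderforce4ai: a module-level worker
# that takes a single tuple, dispatched through Pool.imap_unordered.
from multiprocessing import Pool
from typing import Tuple

def fetch_one(args: Tuple[str, str, int]) -> str:
    """Toy worker: unpack (url, base_url, timeout) and report what it would do."""
    url, base_url, timeout = args
    return f"would POST {base_url}/convert for {url} (timeout={timeout}s)"

if __name__ == "__main__":
    urls = ["https://example.com/a", "https://example.com/b"]
    tasks = [(u, "http://sf4ai.local", 30) for u in urls]  # hypothetical base URL
    with Pool(processes=2) as pool:
        for line in pool.imap_unordered(fetch_one, tasks):
            print(line)
```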
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
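The synchronous report writer mirrors the async `_save_report`, so the JSON layout on disk is the one built above: top-level `timestamp`, `config`, `results.successful` / `results.failed` (each entry an `asdict(CrawlResult)`), and a `summary` block. A small sketch of reading such a report back; the filename is an illustrative stand-in for whatever `config.report_file` points at:

```python
import json

# "crawl_report.json" is a hypothetical value for config.report_file.
with open("crawl_report.json", "r", encoding="utf-8") as f:
    report = json.load(f)

print(report["timestamp"], report["summary"])  # totals per the "summary" block
for failed in report["results"]["failed"]:
    print(failed["url"], failed["error"])      # each entry is an asdict(CrawlResult)
```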
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-        endpoint = f"{self.base_url}/convert"
-        payload = {
-            "url": url,
-            **config.to_dict()
-        }
-
-        response = requests.post(endpoint, json=payload, timeout=config.timeout)
-        if response.status_code != 200:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=f"HTTP {response.status_code}: {response.text}",
-                config=config.to_dict()
-            )
-
-        markdown = response.text
-
-        # Save markdown if output directory is configured
-        if config.output_dir:
-            filepath = config.output_dir / f"{slugify(url)}.md"
-            with open(filepath, 'w', encoding='utf-8') as f:
-                f.write(markdown)
-
-        # Add delay if configured
-        if config.request_delay:
-            time.sleep(config.request_delay)
-
-        return CrawlResult(
-            url=url,
-            status="success",
-            markdown=markdown,
-            config=config.to_dict()
-        )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
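This is the substantive fix in 0.1.5: the 0.1.4 file shipped a truncated `pool.imap_unordered(` call (alongside the orphaned request-handling code removed in the previous hunk), which 0.1.5 replaces with a complete call to the new module-level worker over the `(url, base_url, config)` tuples prepared above. A hedged usage sketch of the repaired synchronous path; the constructor argument and the `CrawlConfig` field names below are assumptions based on the attributes referenced in this diff, not signatures shown here:

```python
# Minimal usage sketch under stated assumptions; not a documented example.
from pathlib import Path
from spiderforce4ai import SpiderForce4AI, CrawlConfig

spider = SpiderForce4AI("http://localhost:3004")   # hypothetical service base URL
config = CrawlConfig(
    max_concurrent_requests=4,                     # size of the multiprocessing Pool
    timeout=30,
    output_dir=Path("markdown"),                   # one slugified .md file per URL
    report_file=Path("crawl_report.json"),
)

# Runs on multiprocessing + requests only; no asyncio event loop is needed.
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(r.status == "success" for r in results), "pages converted")
```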
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
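The context-manager methods removed in the earlier hunk (where they had been followed by a truncated `def` and leftover report-dict fragments) reappear here unchanged, now placed after `crawl_sitemap_parallel` at the end of the class. A sketch of the async usage they support; the constructor argument and a default-constructed `CrawlConfig()` are assumptions, since neither signature appears in this diff:

```python
# Illustrative sketch only, assuming the constructor takes the service base URL.
import asyncio
from spiderforce4ai import SpiderForce4AI, CrawlConfig

async def main() -> None:
    # __aenter__ opens the shared HTTP session, __aexit__ closes it on the way out.
    async with SpiderForce4AI("http://localhost:3004") as spider:   # hypothetical URL
        results = await spider.crawl_sitemap_async(
            "https://example.com/sitemap.xml", CrawlConfig()        # defaults assumed
        )
        print(len(results), "results")

if __name__ == "__main__":
    asyncio.run(main())
```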
spiderforce4ai-0.1.5.dist-info/RECORD
ADDED
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+spiderforce4ai-0.1.5.dist-info/METADATA,sha256=Fm5H-qr4CBfJAVKXyJXsABYib_Vhvn2iUb6T6qSidHg,6214
+spiderforce4ai-0.1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.5.dist-info/RECORD,,
spiderforce4ai-0.1.4.dist-info/RECORD
DELETED
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
-spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
-spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.4.dist-info/RECORD,,
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/WHEEL
File without changes
{spiderforce4ai-0.1.4.dist-info → spiderforce4ai-0.1.5.dist-info}/top_level.txt
File without changes