spiderforce4ai 0.1.3__tar.gz → 0.1.5__tar.gz
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/PKG-INFO +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/pyproject.toml +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/setup.py +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai/__init__.py +89 -87
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/PKG-INFO +1 -1
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/README.md +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/setup.cfg +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/SOURCES.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/dependency_links.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/requires.txt +0 -0
- {spiderforce4ai-0.1.3 → spiderforce4ai-0.1.5}/spiderforce4ai.egg-info/top_level.txt +0 -0
--- spiderforce4ai-0.1.3/pyproject.toml
+++ spiderforce4ai-0.1.5/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spiderforce4ai"
-version = "0.1.3"
+version = "0.1.5"
 description = "Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service"
 readme = "README.md"
 authors = [{name = "Piotr Tamulewicz", email = "pt@petertam.pro"}]
--- spiderforce4ai-0.1.3/spiderforce4ai/__init__.py
+++ spiderforce4ai-0.1.5/spiderforce4ai/__init__.py
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
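Note on the new `_process_url_parallel` helper above: it is defined at module level rather than as a method on `SpiderForce4AI`, most likely because `multiprocessing.Pool` has to pickle the callable and its arguments when dispatching work to child processes, and a bound method on an object holding live network sessions would not pickle cleanly. Below is a minimal, self-contained sketch of that pattern; the worker function, URLs, and service address are illustrative stand-ins, not part of the package.

```python
# Minimal sketch of the module-level-worker pattern used by _process_url_parallel.
# Everything handed to the pool (the function and each args tuple) must be picklable,
# which is why the worker lives at module scope. Names here are illustrative only.
from multiprocessing import Pool
from typing import Tuple


def _convert_one(args: Tuple[str, str]) -> str:
    """Unpack (url, base_url) and return a result string (stand-in for the HTTP call)."""
    url, base_url = args
    return f"would POST {{'url': '{url}'}} to {base_url}/convert"


if __name__ == "__main__":
    base_url = "http://localhost:3004"  # assumed service address, adjust as needed
    urls = ["https://example.com/a", "https://example.com/b"]
    tasks = [(u, base_url) for u in urls]

    with Pool(processes=2) as pool:
        # imap_unordered yields results as soon as each worker finishes,
        # mirroring how crawl_sitemap_parallel consumes them in the diff above.
        for result in pool.imap_unordered(_convert_one, tasks):
            print(result)
```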
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-
-
-            endpoint = f"{self.base_url}/convert"
-            payload = {
-                "url": url,
-                **config.to_dict()
-            }
-
-            response = requests.post(endpoint, json=payload, timeout=config.timeout)
-            if response.status_code != 200:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=f"HTTP {response.status_code}: {response.text}",
-                    config=config.to_dict()
-                )
-
-            markdown = response.text
-
-            # Save markdown if output directory is configured
-            if config.output_dir:
-                filepath = config.output_dir / f"{slugify(url)}.md"
-                with open(filepath, 'w', encoding='utf-8') as f:
-                    f.write(markdown)
-
-            # Add delay if configured
-            if config.request_delay:
-                time.sleep(config.request_delay)
-
-            return CrawlResult(
-                url=url,
-                status="success",
-                markdown=markdown,
-                config=config.to_dict()
-            )
-
-        except Exception as e:
-            return CrawlResult(
-                url=url,
-                status="failed",
-                error=str(e),
-                config=config.to_dict()
-            )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def
-        """
-
-
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-
-
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
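For context, a hedged usage sketch of the parallel crawl path touched by this release follows. The class, method, and config field names all appear in the diff above, but the exact `CrawlConfig` constructor signature, the `SpiderForce4AI` constructor argument, and the service URL are assumptions, not confirmed by this diff.

```python
# Hedged usage sketch for spiderforce4ai 0.1.5's multiprocessing-based sitemap crawl.
# CrawlConfig is a dataclass in the diff above; the keyword arguments below are assumed
# to be its init fields because the code reads them (config.max_concurrent_requests,
# config.output_dir, config.report_file, config.request_delay, config.timeout).
from pathlib import Path

from spiderforce4ai import SpiderForce4AI, CrawlConfig

config = CrawlConfig(
    max_concurrent_requests=4,          # size of the multiprocessing Pool
    output_dir=Path("./markdown"),      # one <slug>.md file per converted URL
    report_file=Path("./report.json"),  # JSON report of successful/failed results
    request_delay=0.5,                  # optional pause after each request (seconds)
    timeout=30,                         # per-request timeout passed to requests.post
)

# Base URL of a running SpiderForce4AI service (placeholder address).
client = SpiderForce4AI("http://localhost:3004")

# Multiprocessing-based crawl: no asyncio event loop required.
results = client.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
print(sum(r.status == "success" for r in results), "of", len(results), "pages converted")
```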