spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

--- spiderforce4ai/__init__.py
+++ spiderforce4ai/__init__.py
@@ -1,13 +1,10 @@
- """
- SpiderForce4AI Python Wrapper
- A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
- """
+ # spiderforce4ai/__init__.py

  import asyncio
  import aiohttp
  import json
  import logging
- from typing import List, Dict, Union, Optional
+ from typing import List, Dict, Union, Optional, Tuple
  from dataclasses import dataclass, asdict
  from urllib.parse import urljoin, urlparse
  from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
  from rich.console import Console
  import aiofiles
  import httpx
+ import requests
  from multiprocessing import Pool

  console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
  payload["remove_selectors_regex"] = self.remove_selectors_regex
  return payload

+ # Module level function for multiprocessing
+ def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+ """Process a single URL for parallel processing."""
+ url, base_url, config = args
+ try:
+ endpoint = f"{base_url}/convert"
+ payload = {
+ "url": url,
+ **config.to_dict()
+ }
+
+ response = requests.post(endpoint, json=payload, timeout=config.timeout)
+ if response.status_code != 200:
+ return CrawlResult(
+ url=url,
+ status="failed",
+ error=f"HTTP {response.status_code}: {response.text}",
+ config=config.to_dict()
+ )
+
+ markdown = response.text
+
+ # Save markdown if output directory is configured
+ if config.output_dir:
+ filepath = config.output_dir / f"{slugify(url)}.md"
+ with open(filepath, 'w', encoding='utf-8') as f:
+ f.write(markdown)
+
+ # Add delay if configured
+ if config.request_delay:
+ time.sleep(config.request_delay)
+
+ return CrawlResult(
+ url=url,
+ status="success",
+ markdown=markdown,
+ config=config.to_dict()
+ )
+
+ except Exception as e:
+ return CrawlResult(
+ url=url,
+ status="failed",
+ error=str(e),
+ config=config.to_dict()
+ )
+
  class SpiderForce4AI:
  """Main class for interacting with SpiderForce4AI service."""

@@ -140,6 +185,25 @@ class SpiderForce4AI:
  except Exception as e:
  console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")

+ def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+ """Save crawl report synchronously."""
+ report = {
+ "timestamp": datetime.now().isoformat(),
+ "config": config.to_dict(),
+ "results": {
+ "successful": [asdict(r) for r in results if r.status == "success"],
+ "failed": [asdict(r) for r in results if r.status == "failed"]
+ },
+ "summary": {
+ "total": len(results),
+ "successful": len([r for r in results if r.status == "success"]),
+ "failed": len([r for r in results if r.status == "failed"])
+ }
+ }
+
+ with open(config.report_file, 'w', encoding='utf-8') as f:
+ json.dump(report, f, indent=2)
+
  async def _save_report(self, config: CrawlConfig):
  """Save crawl report to JSON file."""
  if not config.report_file:
@@ -286,28 +350,8 @@ class SpiderForce4AI:
  """Synchronous version of crawl_sitemap_async."""
  return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))

- async def __aenter__(self):
- """Async context manager entry."""
- await self._ensure_session()
- return self
-
- async def __aexit__(self, exc_type, exc_val, exc_tb):
- """Async context manager exit."""
- await self._close_session()
-
- def __enter__(self):
- """Sync context manager entry."""
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- """Sync context manager exit."""
- self._executor.shutdown(wait=True)
-
-
  def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
- """
- Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
- """
+ """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
  print(f"Fetching sitemap from {sitemap_url}...")

  # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
  print(f"Error parsing sitemap: {str(e)}")
  raise

- def _crawl_single(url: str) -> CrawlResult:
- try:
- endpoint = f"{self.base_url}/convert"
- payload = {
- "url": url,
- **config.to_dict()
- }
-
- response = requests.post(endpoint, json=payload, timeout=config.timeout)
- if response.status_code != 200:
- return CrawlResult(
- url=url,
- status="failed",
- error=f"HTTP {response.status_code}: {response.text}",
- config=config.to_dict()
- )
-
- markdown = response.text
-
- # Save markdown if output directory is configured
- if config.output_dir:
- filepath = config.output_dir / f"{slugify(url)}.md"
- with open(filepath, 'w', encoding='utf-8') as f:
- f.write(markdown)
-
- # Add delay if configured
- if config.request_delay:
- time.sleep(config.request_delay)
-
- return CrawlResult(
- url=url,
- status="success",
- markdown=markdown,
- config=config.to_dict()
- )
-
- except Exception as e:
- return CrawlResult(
- url=url,
- status="failed",
- error=str(e),
- config=config.to_dict()
- )
+ # Prepare arguments for parallel processing
+ process_args = [(url, self.base_url, config) for url in urls]

  # Create process pool and execute crawls
  results = []
+
  with Pool(processes=config.max_concurrent_requests) as pool:
  with Progress(
  SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
  ) as progress:
  task = progress.add_task("Crawling URLs...", total=len(urls))

- for result in pool.imap_unordered(_crawl_single, urls):
+ for result in pool.imap_unordered(_process_url_parallel, process_args):
  results.append(result)
  progress.update(task, advance=1)
  status = "✓" if result.status == "success" else "✗"
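
The callable handed to `imap_unordered` above is the module-level `_process_url_parallel` introduced in this release, fed with pre-built argument tuples, rather than the nested `_crawl_single` it replaces. `multiprocessing.Pool` has to pickle the function it dispatches to worker processes; a function defined inside a method cannot be pickled, while a top-level function taking a single args tuple can. A minimal, self-contained sketch of that pattern follows (the worker, URLs, and service address are hypothetical stand-ins, not part of spiderforce4ai):

```python
# Minimal sketch of the module-level-worker pattern (hypothetical names throughout).
from multiprocessing import Pool
from typing import Tuple


def _work(args: Tuple[str, str]) -> str:
    # Top-level functions can be pickled and shipped to pool workers;
    # functions defined inside another function or method cannot.
    url, base_url = args
    return f"would POST {base_url}/convert for {url}"


if __name__ == "__main__":
    tasks = [(u, "http://localhost:3004") for u in ("https://example.com/a", "https://example.com/b")]
    with Pool(processes=2) as pool:
        for line in pool.imap_unordered(_work, tasks):
            print(line)
```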
@@ -405,21 +409,19 @@ class SpiderForce4AI:

  return results

- def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
- """Save crawl report synchronously."""
- report = {
- "timestamp": datetime.now().isoformat(),
- "config": config.to_dict(),
- "results": {
- "successful": [asdict(r) for r in results if r.status == "success"],
- "failed": [asdict(r) for r in results if r.status == "failed"]
- },
- "summary": {
- "total": len(results),
- "successful": len([r for r in results if r.status == "success"]),
- "failed": len([r for r in results if r.status == "failed"])
- }
- }
+ async def __aenter__(self):
+ """Async context manager entry."""
+ await self._ensure_session()
+ return self

- with open(config.report_file, 'w', encoding='utf-8') as f:
- json.dump(report, f, indent=2)
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ """Async context manager exit."""
+ await self._close_session()
+
+ def __enter__(self):
+ """Sync context manager entry."""
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """Sync context manager exit."""
+ self._executor.shutdown(wait=True)
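
For orientation, here is a sketch of how the parallel path shown above might be driven. The `CrawlConfig` field names mirror the ones referenced in this diff (`output_dir`, `report_file`, `max_concurrent_requests`, `request_delay`, `timeout`); the constructor signatures, the service URL, and the sitemap URL are illustrative assumptions rather than anything taken from the package's documentation.

```python
# Hedged usage sketch: the CrawlConfig field names come from the diff above; the
# constructor signatures, service URL, and sitemap URL are placeholders/assumptions.
from pathlib import Path

from spiderforce4ai import SpiderForce4AI, CrawlConfig

config = CrawlConfig(
    output_dir=Path("./markdown"),            # per-URL .md files, slugified URL as filename
    report_file=Path("./crawl_report.json"),  # JSON report written by _save_report_sync
    max_concurrent_requests=4,                # size of the multiprocessing Pool
    request_delay=0.5,                        # optional pause after each request, in seconds
    timeout=30,                               # per-request timeout passed to requests.post
)

spider = SpiderForce4AI("http://localhost:3004")  # assumed: service base URL passed to the client
results = spider.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)

succeeded = sum(1 for r in results if r.status == "success")
print(f"{succeeded}/{len(results)} URLs converted")
```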

--- spiderforce4ai-0.1.4.dist-info/METADATA
+++ spiderforce4ai-0.1.5.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: spiderforce4ai
- Version: 0.1.4
+ Version: 0.1.5
  Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
  Home-page: https://petertam.pro
  Author: Piotr Tamulewicz

--- /dev/null
+++ spiderforce4ai-0.1.5.dist-info/RECORD
@@ -0,0 +1,5 @@
+ spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+ spiderforce4ai-0.1.5.dist-info/METADATA,sha256=Fm5H-qr4CBfJAVKXyJXsABYib_Vhvn2iUb6T6qSidHg,6214
+ spiderforce4ai-0.1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ spiderforce4ai-0.1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+ spiderforce4ai-0.1.5.dist-info/RECORD,,

--- spiderforce4ai-0.1.4.dist-info/RECORD
+++ /dev/null
@@ -1,5 +0,0 @@
- spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
- spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
- spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
- spiderforce4ai-0.1.4.dist-info/RECORD,,