spiderforce4ai 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

spiderforce4ai/__init__.py
@@ -1,13 +1,10 @@
-"""
-SpiderForce4AI Python Wrapper
-A Python package for interacting with SpiderForce4AI HTML-to-Markdown conversion service.
-"""
+# spiderforce4ai/__init__.py
 
 import asyncio
 import aiohttp
 import json
 import logging
-from typing import List, Dict, Union, Optional
+from typing import List, Dict, Union, Optional, Tuple
 from dataclasses import dataclass, asdict
 from urllib.parse import urljoin, urlparse
 from pathlib import Path
@@ -20,6 +17,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 from rich.console import Console
 import aiofiles
 import httpx
+import requests
 from multiprocessing import Pool
 
 console = Console()
@@ -88,6 +86,53 @@ class CrawlConfig:
             payload["remove_selectors_regex"] = self.remove_selectors_regex
         return payload
 
+# Module level function for multiprocessing
+def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
+    """Process a single URL for parallel processing."""
+    url, base_url, config = args
+    try:
+        endpoint = f"{base_url}/convert"
+        payload = {
+            "url": url,
+            **config.to_dict()
+        }
+
+        response = requests.post(endpoint, json=payload, timeout=config.timeout)
+        if response.status_code != 200:
+            return CrawlResult(
+                url=url,
+                status="failed",
+                error=f"HTTP {response.status_code}: {response.text}",
+                config=config.to_dict()
+            )
+
+        markdown = response.text
+
+        # Save markdown if output directory is configured
+        if config.output_dir:
+            filepath = config.output_dir / f"{slugify(url)}.md"
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(markdown)
+
+        # Add delay if configured
+        if config.request_delay:
+            time.sleep(config.request_delay)
+
+        return CrawlResult(
+            url=url,
+            status="success",
+            markdown=markdown,
+            config=config.to_dict()
+        )
+
+    except Exception as e:
+        return CrawlResult(
+            url=url,
+            status="failed",
+            error=str(e),
+            config=config.to_dict()
+        )
+
 class SpiderForce4AI:
     """Main class for interacting with SpiderForce4AI service."""
 
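Note: the crawl worker moved from a closure inside `crawl_sitemap_parallel` (see the later hunk) to the module-level `_process_url_parallel` above. `multiprocessing.Pool` ships its callable to worker processes via pickle, and pickle serializes functions by qualified name, so locally defined functions fail while module-level ones work. A minimal standalone sketch of that constraint (illustrative names, not from this package):

    import pickle

    def module_level(x):
        # Pickled by qualified name; usable with multiprocessing.Pool.
        return x * 2

    def main():
        def nested(x):
            return x * 2

        pickle.dumps(module_level)  # fine
        try:
            pickle.dumps(nested)    # raises: can't pickle local object
        except (pickle.PicklingError, AttributeError) as e:
            print(f"nested function is not picklable: {e}")

    if __name__ == "__main__":
        main()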
@@ -140,6 +185,25 @@ class SpiderForce4AI:
         except Exception as e:
             console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")
 
+    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
+        """Save crawl report synchronously."""
+        report = {
+            "timestamp": datetime.now().isoformat(),
+            "config": config.to_dict(),
+            "results": {
+                "successful": [asdict(r) for r in results if r.status == "success"],
+                "failed": [asdict(r) for r in results if r.status == "failed"]
+            },
+            "summary": {
+                "total": len(results),
+                "successful": len([r for r in results if r.status == "success"]),
+                "failed": len([r for r in results if r.status == "failed"])
+            }
+        }
+
+        with open(config.report_file, 'w', encoding='utf-8') as f:
+            json.dump(report, f, indent=2)
+
     async def _save_report(self, config: CrawlConfig):
         """Save crawl report to JSON file."""
         if not config.report_file:
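Note: `_save_report_sync` is the synchronous counterpart of the async `_save_report`, added for the multiprocessing path, which runs outside any event loop. For orientation, the report file it writes has roughly this shape (a sketch with made-up values; real entries carry the full `CrawlResult` fields):

    report = {
        "timestamp": "2025-02-01T12:00:00",          # datetime.now().isoformat()
        "config": {"...": "serialized CrawlConfig"},
        "results": {
            "successful": [{"url": "https://example.com/", "status": "success"}],
            "failed": [{"url": "https://example.com/missing", "status": "failed",
                        "error": "HTTP 404: Not Found"}],
        },
        "summary": {"total": 2, "successful": 1, "failed": 1},
    }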
@@ -286,28 +350,8 @@ class SpiderForce4AI:
         """Synchronous version of crawl_sitemap_async."""
         return asyncio.run(self.crawl_sitemap_async(sitemap_url, config))
 
-    async def __aenter__(self):
-        """Async context manager entry."""
-        await self._ensure_session()
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        """Async context manager exit."""
-        await self._close_session()
-
-    def __enter__(self):
-        """Sync context manager entry."""
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Sync context manager exit."""
-        self._executor.shutdown(wait=True)
-
-
     def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
-        """
-        Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).
-        """
+        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required)."""
         print(f"Fetching sitemap from {sitemap_url}...")
 
         # Fetch sitemap
@@ -329,52 +373,12 @@ class SpiderForce4AI:
             print(f"Error parsing sitemap: {str(e)}")
             raise
 
-        def _crawl_single(url: str) -> CrawlResult:
-            try:
-                endpoint = f"{self.base_url}/convert"
-                payload = {
-                    "url": url,
-                    **config.to_dict()
-                }
-
-                response = requests.post(endpoint, json=payload, timeout=config.timeout)
-                if response.status_code != 200:
-                    return CrawlResult(
-                        url=url,
-                        status="failed",
-                        error=f"HTTP {response.status_code}: {response.text}",
-                        config=config.to_dict()
-                    )
-
-                markdown = response.text
-
-                # Save markdown if output directory is configured
-                if config.output_dir:
-                    filepath = config.output_dir / f"{slugify(url)}.md"
-                    with open(filepath, 'w', encoding='utf-8') as f:
-                        f.write(markdown)
-
-                # Add delay if configured
-                if config.request_delay:
-                    time.sleep(config.request_delay)
-
-                return CrawlResult(
-                    url=url,
-                    status="success",
-                    markdown=markdown,
-                    config=config.to_dict()
-                )
-
-            except Exception as e:
-                return CrawlResult(
-                    url=url,
-                    status="failed",
-                    error=str(e),
-                    config=config.to_dict()
-                )
+        # Prepare arguments for parallel processing
+        process_args = [(url, self.base_url, config) for url in urls]
 
         # Create process pool and execute crawls
         results = []
+
         with Pool(processes=config.max_concurrent_requests) as pool:
             with Progress(
                 SpinnerColumn(),
@@ -385,7 +389,7 @@ class SpiderForce4AI:
             ) as progress:
                 task = progress.add_task("Crawling URLs...", total=len(urls))
 
-                for result in pool.imap_unordered(_crawl_single, urls):
+                for result in pool.imap_unordered(_process_url_parallel, process_args):
                     results.append(result)
                     progress.update(task, advance=1)
                     status = "✓" if result.status == "success" else "✗"
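Note: `Pool.imap_unordered` passes exactly one argument to the worker, so the state the old closure captured (`self.base_url` and `config`) is now packed into one tuple per URL and unpacked inside `_process_url_parallel`. A standalone sketch of the pattern (hypothetical names and values, not the package's API):

    from multiprocessing import Pool
    from typing import Tuple

    def work(args: Tuple[str, str, int]) -> str:
        # Each task arrives as a single picklable tuple; unpack it here.
        url, base_url, timeout = args
        return f"{base_url}/convert <- {url} (timeout={timeout}s)"

    if __name__ == "__main__":
        urls = ["https://example.com/a", "https://example.com/b"]
        tasks = [(u, "http://localhost:8000", 30) for u in urls]
        with Pool(processes=2) as pool:
            # imap_unordered yields results as workers finish, not in input order.
            for line in pool.imap_unordered(work, tasks):
                print(line)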
@@ -405,21 +409,19 @@ class SpiderForce4AI:
 
         return results
 
-    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
-        """Save crawl report synchronously."""
-        report = {
-            "timestamp": datetime.now().isoformat(),
-            "config": config.to_dict(),
-            "results": {
-                "successful": [asdict(r) for r in results if r.status == "success"],
-                "failed": [asdict(r) for r in results if r.status == "failed"]
-            },
-            "summary": {
-                "total": len(results),
-                "successful": len([r for r in results if r.status == "success"]),
-                "failed": len([r for r in results if r.status == "failed"])
-            }
-        }
+    async def __aenter__(self):
+        """Async context manager entry."""
+        await self._ensure_session()
+        return self
 
-        with open(config.report_file, 'w', encoding='utf-8') as f:
-            json.dump(report, f, indent=2)
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self._close_session()
+
+    def __enter__(self):
+        """Sync context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Sync context manager exit."""
+        self._executor.shutdown(wait=True)
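Note: the context-manager methods were relocated after `crawl_sitemap_parallel` and `_save_report_sync` moved, not removed, so both usage styles still work in 0.1.5. A hedged usage sketch (the constructor argument and CrawlConfig defaults are assumptions; check the actual signatures):

    # Hypothetical usage; the base URL and config values are illustrative only.
    from spiderforce4ai import SpiderForce4AI, CrawlConfig

    if __name__ == "__main__":
        config = CrawlConfig()  # real code would set output_dir, timeout, etc.
        # Sync: __exit__ shuts down the internal executor on the way out.
        with SpiderForce4AI("http://localhost:3004") as client:
            results = client.crawl_sitemap_parallel("https://example.com/sitemap.xml", config)
        # Async variant: "async with SpiderForce4AI(...)" manages the aiohttp
        # session via __aenter__/__aexit__.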
spiderforce4ai-0.1.5.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: spiderforce4ai
-Version: 0.1.4
+Version: 0.1.5
 Summary: Python wrapper for SpiderForce4AI HTML-to-Markdown conversion service
 Home-page: https://petertam.pro
 Author: Piotr Tamulewicz
spiderforce4ai-0.1.5.dist-info/RECORD
@@ -0,0 +1,5 @@
+spiderforce4ai/__init__.py,sha256=i1lHYILqFG_Eld0ZCbBdK5F_Jk0zYr_60vS46AYZfTM,16496
+spiderforce4ai-0.1.5.dist-info/METADATA,sha256=Fm5H-qr4CBfJAVKXyJXsABYib_Vhvn2iUb6T6qSidHg,6214
+spiderforce4ai-0.1.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+spiderforce4ai-0.1.5.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
+spiderforce4ai-0.1.5.dist-info/RECORD,,
spiderforce4ai-0.1.4.dist-info/RECORD
@@ -1,5 +0,0 @@
-spiderforce4ai/__init__.py,sha256=ZWt8m5r5tWmjHNE4x45yI-k522_tVCUvEPth-3Yulfg,16633
-spiderforce4ai-0.1.4.dist-info/METADATA,sha256=olJX54IVWgw92JpagtLnH_wOERNSuBWXbOjw8uSTFq4,6214
-spiderforce4ai-0.1.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-spiderforce4ai-0.1.4.dist-info/top_level.txt,sha256=Kth7A21Js7DCp0j5XBBi-FE45SCLouZkeNZU__Yr9Yk,15
-spiderforce4ai-0.1.4.dist-info/RECORD,,