diffbot-python 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffbot/client.py ADDED
@@ -0,0 +1,285 @@
1
+ """The Diffbot client classes (sync and async)."""
2
+
3
+ import pathlib
4
+ from types import TracebackType
5
+ from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Type, Union
6
+
7
+ import httpx
8
+
9
+ from . import __version__
10
+ from .errors import APIError, AuthError, RateLimitError, ValidationError
11
+ from .extract import extract as _extract, extract_async as _extract_async
12
+ from .ask import ask as _ask, ask_async as _ask_async
13
+ from .crawl import (
14
+ CrawlEvent,
15
+ crawl as _crawl,
16
+ crawl_async as _crawl_async,
17
+ crawl_delete_job as _crawl_delete_job,
18
+ crawl_delete_job_async as _crawl_delete_job_async,
19
+ crawl_get_job as _crawl_get_job,
20
+ crawl_get_job_async as _crawl_get_job_async,
21
+ crawl_list_jobs as _crawl_list_jobs,
22
+ crawl_list_jobs_async as _crawl_list_jobs_async,
23
+ )
24
+ from .kg import (
25
+ dql as _dql,
26
+ dql_async as _dql_async,
27
+ dql_parallel as _dql_parallel,
28
+ dql_refresh_ontology as _dql_refresh_ontology,
29
+ )
30
+ from .web_search import (
31
+ WEB_SEARCH_BASE,
32
+ web_search as _web_search,
33
+ web_search_async as _web_search_async,
34
+ )
35
+ from .nlp import (
36
+ NLP_BASE,
37
+ entities as _entities,
38
+ entities_async as _entities_async,
39
+ )
40
+
41
+ EXTRACT_BASE = "https://api.diffbot.com/v3"
42
+ CRAWL_BASE = "https://api.diffbot.com/v3/crawl"
43
+ DIFFBOT_LLM_BASE = "https://llm.diffbot.com/rag/v1/chat/completions"
44
+ DEFAULT_TIMEOUT = 30.0
45
+
46
+
47
class Diffbot:
    """Synchronous client for the Diffbot APIs.

    The instance stores the API token, one base URL per service and a shared
    ``httpx.Client``. Each public method hands the instance to the matching
    module-level helper, which performs the actual request.

    Example:
        >>> import os
        >>> from diffbot import Diffbot
        >>> db = Diffbot(token=os.getenv("DIFFBOT_API_TOKEN"))
        >>> db.extract("https://example.com")
    """

    def __init__(
        self,
        token: str,
        *,
        timeout: float = DEFAULT_TIMEOUT,
        analyze_url: str = EXTRACT_BASE,
        llm_url: str = DIFFBOT_LLM_BASE,
        crawler_url: str = CRAWL_BASE,
        web_search_url: str = WEB_SEARCH_BASE,
        nlp_url: str = NLP_BASE,
        transport: Optional[httpx.BaseTransport] = None,
    ):
        """Build a client bound to ``token``.

        Args:
            token: Diffbot API token; must be truthy.
            timeout: Timeout passed to the underlying ``httpx.Client``.
            analyze_url: Base URL the extract helpers append the API name to.
            llm_url: Stored on the instance for the ask helpers.
            crawler_url: Base URL of the crawler endpoints.
            web_search_url: Stored on the instance for the web search helpers.
            nlp_url: Stored on the instance for the NLP helpers.
            transport: Optional httpx transport forwarded to ``httpx.Client``.

        Raises:
            ValidationError: If ``token`` is empty.
        """
        if not token:
            raise ValidationError("token is required")

        self.token = token
        self.analyze_url = analyze_url
        self.llm_url = llm_url
        self.crawler_url = crawler_url
        self.web_search_url = web_search_url
        self.nlp_url = nlp_url

        # Every request made through this client identifies the SDK version.
        default_headers = {"User-Agent": f"diffbot-python/{__version__}"}
        self._http = httpx.Client(
            timeout=timeout,
            headers=default_headers,
            transport=transport,
        )

    def __enter__(self) -> "Diffbot":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._http.close()

    def _raise_for_status(self, response: httpx.Response) -> None:
        """Translate a non-success HTTP response into an SDK exception.

        Raises:
            RateLimitError: On 429, carrying the ``retry-after`` header value.
            AuthError: On 401 or 403.
            APIError: On any other non-success status.
        """
        if not response.is_success:
            code = response.status_code
            text = response.text
            if code == 429:
                raise RateLimitError(code, text, retry_after=response.headers.get("retry-after"))
            if code in (401, 403):
                raise AuthError(code, text)
            raise APIError(code, text)

    def extract(self, url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
        """Extract structured content from ``url`` and return the raw API response."""
        return _extract(self, url, api=api, fmt=fmt)

    def ask(self, messages: List[Dict[str, str]]) -> Iterator[str]:
        """Stream a response from the Diffbot LLM RAG API, chunk by chunk."""
        yield from _ask(self, messages)

    def crawl(self, site: str, **kwargs: Any) -> Iterator[CrawlEvent]:
        """Start a crawl job and yield its events.

        Keyword arguments are forwarded unchanged to the crawl helper.
        """
        yield from _crawl(self, site, **kwargs)

    def crawl_list_jobs(self) -> List[Dict[str, Any]]:
        """Return every crawler job registered for this token."""
        return _crawl_list_jobs(self)

    def crawl_get_job(self, job_name: str) -> Dict[str, Any]:
        """Return the status record of the crawler job ``job_name``."""
        return _crawl_get_job(self, job_name)

    def crawl_delete_job(self, job_name: str) -> None:
        """Delete the crawler job ``job_name``."""
        _crawl_delete_job(self, job_name)

    def dql(
        self,
        query: str,
        *,
        size: int = 10,
        from_: int = 0,
        format: str = "json",
        filter: Optional[str] = None,
        exportspec: Optional[str] = None,
        extra: Optional[Dict[str, str]] = None,
        raw: bool = False,
    ) -> Union[Dict[str, Any], bytes]:
        """Run a DQL query against the Diffbot Knowledge Graph.

        Returns the parsed JSON response, or the raw response bytes when
        ``raw=True`` (e.g. to retrieve CSV/export formats undecoded).
        """
        return _dql(
            self,
            query,
            size=size,
            from_=from_,
            format=format,
            filter=filter,
            exportspec=exportspec,
            extra=extra,
            raw=raw,
        )

    def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
        """Run several DQL queries concurrently; each item is a dict of ``dql()`` keyword args."""
        return _dql_parallel(self, queries, workers=workers)

    def dql_refresh_ontology(self, dest: pathlib.Path) -> None:
        """Download the Diffbot Knowledge Graph ontology and write it to ``dest``."""
        _dql_refresh_ontology(self, dest)

    def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """Search the web via the Diffbot LLM web search API."""
        return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)

    def entities(self, text: str, *, lang: str = "auto") -> Dict[str, Any]:
        """Identify and resolve entities and sentiment in text using the Diffbot NLP API.

        Entity IDs can be looked up in the Knowledge Graph via dql() using
        id:or("id1","id2","id3") — no type: declaration required.
        """
        return _entities(self, text, lang=lang)
169
+
170
+
171
class DiffbotAsync:
    """Asynchronous client for the Diffbot APIs.

    The instance stores the API token, one base URL per service and a shared
    ``httpx.AsyncClient``. Each public coroutine hands the instance to the
    matching module-level async helper.

    Example:
        >>> import os
        >>> async with DiffbotAsync(token=os.getenv("DIFFBOT_API_TOKEN")) as db:
        ...     result = await db.extract("https://example.com")
    """

    def __init__(
        self,
        token: str,
        *,
        timeout: float = DEFAULT_TIMEOUT,
        analyze_url: str = EXTRACT_BASE,
        llm_url: str = DIFFBOT_LLM_BASE,
        crawler_url: str = CRAWL_BASE,
        web_search_url: str = WEB_SEARCH_BASE,
        nlp_url: str = NLP_BASE,
        transport: Optional[httpx.AsyncBaseTransport] = None,
    ):
        """Build an async client bound to ``token``.

        Args:
            token: Diffbot API token; must be truthy.
            timeout: Timeout passed to the underlying ``httpx.AsyncClient``.
            analyze_url: Base URL the extract helpers append the API name to.
            llm_url: Stored on the instance for the ask helpers.
            crawler_url: Base URL of the crawler endpoints.
            web_search_url: Stored on the instance for the web search helpers.
            nlp_url: Stored on the instance for the NLP helpers.
            transport: Optional async httpx transport forwarded to the client.

        Raises:
            ValidationError: If ``token`` is empty.
        """
        if not token:
            raise ValidationError("token is required")

        self.token = token
        self.analyze_url = analyze_url
        self.llm_url = llm_url
        self.crawler_url = crawler_url
        self.web_search_url = web_search_url
        self.nlp_url = nlp_url

        # Every request made through this client identifies the SDK version.
        default_headers = {"User-Agent": f"diffbot-python/{__version__}"}
        self._http = httpx.AsyncClient(
            timeout=timeout,
            headers=default_headers,
            transport=transport,
        )

    async def __aenter__(self) -> "DiffbotAsync":
        """Return the client itself so it can be used in ``async with``."""
        return self

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        """Close the HTTP client on leaving the ``async with`` block."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._http.aclose()

    def _raise_for_status(self, response: httpx.Response) -> None:
        """Translate a non-success HTTP response into an SDK exception.

        Raises:
            RateLimitError: On 429, carrying the ``retry-after`` header value.
            AuthError: On 401 or 403.
            APIError: On any other non-success status.
        """
        if not response.is_success:
            code = response.status_code
            text = response.text
            if code == 429:
                raise RateLimitError(code, text, retry_after=response.headers.get("retry-after"))
            if code in (401, 403):
                raise AuthError(code, text)
            raise APIError(code, text)

    async def extract(self, url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
        """Extract structured content from ``url`` and return the raw API response."""
        return await _extract_async(self, url, api=api, fmt=fmt)

    async def ask(self, messages: List[Dict[str, str]]) -> AsyncIterator[str]:
        """Stream a response from the Diffbot LLM RAG API, chunk by chunk."""
        async for piece in _ask_async(self, messages):
            yield piece

    async def crawl(self, site: str, **kwargs: Any) -> AsyncIterator[CrawlEvent]:
        """Start a crawl job. Pass watch=True to poll until completion and yield URL_PROCESSED events."""
        async for crawl_event in _crawl_async(self, site, **kwargs):
            yield crawl_event

    async def crawl_list_jobs(self) -> List[Dict[str, Any]]:
        """Return every crawler job registered for this token."""
        return await _crawl_list_jobs_async(self)

    async def crawl_get_job(self, job_name: str) -> Dict[str, Any]:
        """Return the status record of the crawler job ``job_name``."""
        return await _crawl_get_job_async(self, job_name)

    async def crawl_delete_job(self, job_name: str) -> None:
        """Delete the crawler job ``job_name``."""
        await _crawl_delete_job_async(self, job_name)

    async def dql(
        self,
        query: str,
        *,
        size: int = 10,
        from_: int = 0,
        format: str = "json",
        filter: Optional[str] = None,
        exportspec: Optional[str] = None,
        extra: Optional[Dict[str, str]] = None,
        raw: bool = False,
    ) -> Union[Dict[str, Any], bytes]:
        """Run a DQL query against the Diffbot Knowledge Graph.

        Returns the parsed JSON response, or the raw response bytes when
        ``raw=True`` (e.g. to retrieve CSV/export formats undecoded).
        """
        return await _dql_async(
            self,
            query,
            size=size,
            from_=from_,
            format=format,
            filter=filter,
            exportspec=exportspec,
            extra=extra,
            raw=raw,
        )

    async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """Search the web via the Diffbot LLM web search API."""
        return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)

    async def entities(self, text: str, *, lang: str = "auto") -> Dict[str, Any]:
        """Identify and resolve entities and sentiment in text using the Diffbot NLP API.

        Entity IDs can be looked up in the Knowledge Graph via dql() using
        id:or("id1","id2","id3") — no type: declaration required.
        """
        return await _entities_async(self, text, lang=lang)
diffbot/crawl.py ADDED
@@ -0,0 +1,270 @@
1
+ """Diffbot Crawler API: start crawls, manage crawler jobs."""
2
+
3
+ import asyncio
4
+ import csv
5
+ import io
6
+ import time
7
+ from dataclasses import dataclass
8
+ from datetime import datetime
9
+ from enum import Enum
10
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Iterator, List, Optional
11
+
12
+ from .errors import APIError
13
+
14
+ if TYPE_CHECKING:
15
+ from .client import Diffbot, DiffbotAsync
16
+
17
+
18
class CrawlEventType(Enum):
    """Kinds of event a crawl generator can yield."""

    # Emitted once, right after the create-job request succeeded.
    JOB_CREATED = "job_created"
    # Emitted for each URL first seen in the job's URL report.
    URL_PROCESSED = "url_processed"
21
+
22
+
23
@dataclass
class CrawlEvent:
    """A single notification produced while a crawl is running."""

    # Which kind of event this is.
    event_type: CrawlEventType
    # ISO-format local time for JOB_CREATED; for URL_PROCESSED the
    # "Crawled Time" column of the URL report when present.
    timestamp: str
    # Event payload: {"job_name": ...} or {"url": ..., "status": ...}.
    details: Dict[str, Any]
29
+
30
+
31
def crawl(
    client: "Diffbot",
    site: str,
    hops: int = 2,
    job_name: Optional[str] = None,
    max_to_crawl: int = 100,
    max_to_process: int = 100,
    restrict_domain: bool = True,
    api_url: str = "",
    crawl_delay: float = -1,
    url_crawl_pattern: Optional[str] = None,
    url_process_pattern: Optional[str] = None,
    obey_robots: bool = False,
    use_proxies: bool = False,
    custom_headers: Optional[str] = None,
    watch: bool = False,
    poll_interval: float = 2.0,
) -> Iterator[CrawlEvent]:
    """Create a crawler job and yield events describing its progress.

    This is a generator: no request is sent until it is first iterated.
    A ``JOB_CREATED`` event is always yielded once the create request has
    succeeded. With ``watch=True`` the job is then polled every
    ``poll_interval`` seconds and a ``URL_PROCESSED`` event is yielded for
    each URL that appears in the job's URL report for the first time.

    Args:
        client: Synchronous Diffbot client supplying token, URL and HTTP session.
        site: Seed URL; ``https://`` is prepended when it has no http(s) scheme.
        hops: Sent as ``maxHops``.
        job_name: Crawler job name; defaults to ``crawl-<unix seconds>``.
        max_to_crawl: Sent as ``maxToCrawl``.
        max_to_process: Sent as ``maxToProcess``.
        restrict_domain: Sent as ``restrictDomain`` (1 or 0).
        api_url: Sent as ``apiUrl`` when non-empty.
        crawl_delay: Sent as ``crawlDelay`` when set; ``-1`` leaves it unset.
        url_crawl_pattern: Sent as ``urlCrawlPattern`` when given.
        url_process_pattern: Sent as ``urlProcessPattern`` when given.
        obey_robots: Sends ``obeyRobots=1`` when true.
        use_proxies: Sends ``useProxies=1`` when true.
        custom_headers: Sent as ``customHeaders`` when given.
        watch: Poll the job until it leaves status 0/7 and stream URL events.
        poll_interval: Seconds to sleep between polls.

    Yields:
        CrawlEvent: One ``JOB_CREATED`` event, then ``URL_PROCESSED`` events.

    Raises:
        APIError: If the job ends with a status message containing "fail" or
            "error", or (via ``client._raise_for_status``) on an HTTP error.
    """
    # Match a real scheme prefix: a bare "http" test would treat hosts such
    # as "httpbin.org" as already-qualified URLs and skip the prefix.
    if not site.lower().startswith(("http://", "https://")):
        site = f"https://{site}"
    if not job_name:
        job_name = f"crawl-{int(time.time())}"

    params = _build_crawl_params(
        client, job_name, site, max_to_crawl, max_to_process, hops, restrict_domain,
        api_url, crawl_delay, url_crawl_pattern, url_process_pattern,
        obey_robots, use_proxies, custom_headers,
    )
    response = client._http.get(client.crawler_url, params=params)
    client._raise_for_status(response)

    yield CrawlEvent(
        event_type=CrawlEventType.JOB_CREATED,
        timestamp=datetime.now().isoformat(),
        details={"job_name": job_name},
    )

    if not watch:
        return

    # URLs already reported, so each one is yielded only once across polls.
    seen_urls: set = set()

    while True:
        status_response = client._http.get(
            client.crawler_url,
            params={"token": client.token, "name": job_name},
        )
        client._raise_for_status(status_response)
        jobs = status_response.json().get("jobs", [])
        if not jobs:
            # The job is no longer listed; nothing left to watch.
            break
        job_status = jobs[0].get("jobStatus", {})
        status_code = job_status.get("status", 0)

        # Fetch the URL report before checking for completion so URLs
        # finished during the last interval are still reported.
        urls_response = client._http.get(
            f"{client.crawler_url}/data",
            params={"token": client.token, "name": job_name, "type": "urls"},
            follow_redirects=True,
        )
        client._raise_for_status(urls_response)
        if urls_response.content:
            for event in _parse_url_csv(urls_response.text, seen_urls):
                yield event

        # Statuses 0 and 7 are treated as "still running"
        # (presumably initializing / in progress — confirm against the API docs).
        if status_code not in (0, 7):
            message = job_status.get("message", "")
            if "fail" in message.lower() or "error" in message.lower():
                raise APIError(500, f"Crawler job failed: {message}")
            break

        time.sleep(poll_interval)
104
+
105
+
106
+ def _build_crawl_params(client: Any, job_name: str, site: str, max_to_crawl: int,
107
+ max_to_process: int, hops: int, restrict_domain: bool,
108
+ api_url: str, crawl_delay: float, url_crawl_pattern: Optional[str],
109
+ url_process_pattern: Optional[str], obey_robots: bool,
110
+ use_proxies: bool, custom_headers: Optional[str]) -> Dict[str, Any]:
111
+ params: Dict[str, Any] = {
112
+ "token": client.token,
113
+ "name": job_name,
114
+ "seeds": site,
115
+ "maxToCrawl": max_to_crawl,
116
+ "maxToProcess": max_to_process,
117
+ "maxHops": hops,
118
+ "restrictDomain": 1 if restrict_domain else 0,
119
+ }
120
+ if api_url:
121
+ params["apiUrl"] = api_url
122
+ if crawl_delay > 0:
123
+ params["crawlDelay"] = crawl_delay
124
+ if url_crawl_pattern:
125
+ params["urlCrawlPattern"] = url_crawl_pattern
126
+ if url_process_pattern:
127
+ params["urlProcessPattern"] = url_process_pattern
128
+ if obey_robots:
129
+ params["obeyRobots"] = 1
130
+ if use_proxies:
131
+ params["useProxies"] = 1
132
+ if custom_headers:
133
+ params["customHeaders"] = custom_headers
134
+ return params
135
+
136
+
137
+ def _parse_url_csv(text: str, seen_urls: set) -> List[CrawlEvent]:
138
+ events = []
139
+ reader = csv.DictReader(io.StringIO(text))
140
+ for row in reader:
141
+ url = row.get("Url", "").strip('"')
142
+ if url and url not in seen_urls:
143
+ seen_urls.add(url)
144
+ events.append(CrawlEvent(
145
+ event_type=CrawlEventType.URL_PROCESSED,
146
+ timestamp=row.get("Crawled Time", datetime.now().isoformat()),
147
+ details={"url": url, "status": row.get("Crawl Status", "unknown")},
148
+ ))
149
+ return events
150
+
151
+
152
async def crawl_async(
    client: "DiffbotAsync",
    site: str,
    hops: int = 2,
    job_name: Optional[str] = None,
    max_to_crawl: int = 100,
    max_to_process: int = 100,
    restrict_domain: bool = True,
    api_url: str = "",
    crawl_delay: float = -1,
    url_crawl_pattern: Optional[str] = None,
    url_process_pattern: Optional[str] = None,
    obey_robots: bool = False,
    use_proxies: bool = False,
    custom_headers: Optional[str] = None,
    watch: bool = False,
    poll_interval: float = 2.0,
) -> AsyncIterator[CrawlEvent]:
    """Create a crawler job and asynchronously yield progress events.

    This is an async generator: no request is sent until it is first
    iterated. A ``JOB_CREATED`` event is always yielded once the create
    request has succeeded. With ``watch=True`` the job is then polled every
    ``poll_interval`` seconds and a ``URL_PROCESSED`` event is yielded for
    each URL that appears in the job's URL report for the first time.

    Args:
        client: Async Diffbot client supplying token, URL and HTTP session.
        site: Seed URL; ``https://`` is prepended when it has no http(s) scheme.
        hops: Sent as ``maxHops``.
        job_name: Crawler job name; defaults to ``crawl-<unix seconds>``.
        max_to_crawl: Sent as ``maxToCrawl``.
        max_to_process: Sent as ``maxToProcess``.
        restrict_domain: Sent as ``restrictDomain`` (1 or 0).
        api_url: Sent as ``apiUrl`` when non-empty.
        crawl_delay: Sent as ``crawlDelay`` when set; ``-1`` leaves it unset.
        url_crawl_pattern: Sent as ``urlCrawlPattern`` when given.
        url_process_pattern: Sent as ``urlProcessPattern`` when given.
        obey_robots: Sends ``obeyRobots=1`` when true.
        use_proxies: Sends ``useProxies=1`` when true.
        custom_headers: Sent as ``customHeaders`` when given.
        watch: Poll the job until it leaves status 0/7 and stream URL events.
        poll_interval: Seconds to sleep (without blocking the loop) between polls.

    Yields:
        CrawlEvent: One ``JOB_CREATED`` event, then ``URL_PROCESSED`` events.

    Raises:
        APIError: If the job ends with a status message containing "fail" or
            "error", or (via ``client._raise_for_status``) on an HTTP error.
    """
    # Match a real scheme prefix: a bare "http" test would treat hosts such
    # as "httpbin.org" as already-qualified URLs and skip the prefix.
    if not site.lower().startswith(("http://", "https://")):
        site = f"https://{site}"
    if not job_name:
        job_name = f"crawl-{int(time.time())}"

    params = _build_crawl_params(
        client, job_name, site, max_to_crawl, max_to_process, hops, restrict_domain,
        api_url, crawl_delay, url_crawl_pattern, url_process_pattern,
        obey_robots, use_proxies, custom_headers,
    )
    response = await client._http.get(client.crawler_url, params=params)
    client._raise_for_status(response)

    yield CrawlEvent(
        event_type=CrawlEventType.JOB_CREATED,
        timestamp=datetime.now().isoformat(),
        details={"job_name": job_name},
    )

    if not watch:
        return

    # URLs already reported, so each one is yielded only once across polls.
    seen_urls: set = set()

    while True:
        status_response = await client._http.get(
            client.crawler_url,
            params={"token": client.token, "name": job_name},
        )
        client._raise_for_status(status_response)
        jobs = status_response.json().get("jobs", [])
        if not jobs:
            # The job is no longer listed; nothing left to watch.
            break
        job_status = jobs[0].get("jobStatus", {})
        status_code = job_status.get("status", 0)

        # Fetch the URL report before checking for completion so URLs
        # finished during the last interval are still reported.
        urls_response = await client._http.get(
            f"{client.crawler_url}/data",
            params={"token": client.token, "name": job_name, "type": "urls"},
            follow_redirects=True,
        )
        client._raise_for_status(urls_response)
        if urls_response.content:
            for event in _parse_url_csv(urls_response.text, seen_urls):
                yield event

        # Statuses 0 and 7 are treated as "still running"
        # (presumably initializing / in progress — confirm against the API docs).
        if status_code not in (0, 7):
            message = job_status.get("message", "")
            if "fail" in message.lower() or "error" in message.lower():
                raise APIError(500, f"Crawler job failed: {message}")
            break

        await asyncio.sleep(poll_interval)
223
+
224
+
225
async def crawl_list_jobs_async(client: "DiffbotAsync") -> List[Dict[str, Any]]:
    """Fetch the crawler jobs for the client's token.

    Returns the ``jobs`` list of the JSON response, or an empty list when
    that key is absent. HTTP errors are raised by ``client._raise_for_status``.
    """
    query = {"token": client.token}
    listing = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(listing)
    payload = listing.json()
    return payload.get("jobs", [])
229
+
230
+
231
async def crawl_get_job_async(client: "DiffbotAsync", job_name: str) -> Dict[str, Any]:
    """Fetch the record of the crawler job called ``job_name``.

    Returns the first entry of the response's ``jobs`` list, or an empty
    dict when the list is missing or empty. HTTP errors are raised by
    ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name}
    reply = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    matches = reply.json().get("jobs", [])
    if not matches:
        return {}
    return matches[0]
239
+
240
+
241
async def crawl_delete_job_async(client: "DiffbotAsync", job_name: str) -> None:
    """Delete the crawler job ``job_name``.

    Sends a GET to the crawler URL with ``delete=1``; HTTP errors are raised
    by ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name, "delete": 1}
    reply = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
247
+
248
+
249
def crawl_list_jobs(client: "Diffbot") -> List[Dict[str, Any]]:
    """Fetch the crawler jobs for the client's token.

    Returns the ``jobs`` list of the JSON response, or an empty list when
    that key is absent. HTTP errors are raised by ``client._raise_for_status``.
    """
    query = {"token": client.token}
    listing = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(listing)
    payload = listing.json()
    return payload.get("jobs", [])
253
+
254
+
255
def crawl_get_job(client: "Diffbot", job_name: str) -> Dict[str, Any]:
    """Fetch the record of the crawler job called ``job_name``.

    Returns the first entry of the response's ``jobs`` list, or an empty
    dict when the list is missing or empty. HTTP errors are raised by
    ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name}
    reply = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    matches = reply.json().get("jobs", [])
    if not matches:
        return {}
    return matches[0]
263
+
264
+
265
def crawl_delete_job(client: "Diffbot", job_name: str) -> None:
    """Delete the crawler job ``job_name``.

    Sends a GET to the crawler URL with ``delete=1``; HTTP errors are raised
    by ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name, "delete": 1}
    reply = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
diffbot/errors.py ADDED
@@ -0,0 +1,51 @@
1
+ """Exception hierarchy for the Diffbot SDK."""
2
+
3
+ import json
4
+ from typing import Optional
5
+
6
+
7
class DiffbotError(Exception):
    """Root of the SDK's exception hierarchy.

    Catch this to handle any error raised by the Diffbot SDK itself.
    """
9
+
10
+
11
class ValidationError(DiffbotError):
    """Raised when an argument is rejected client-side, before any request.

    Examples are a missing token or a malformed argument.
    """
13
+
14
+
15
class APIError(DiffbotError):
    """Raised when the Diffbot API answers with an error response.

    Attributes are set in ``__init__``: ``status_code`` and ``body`` are
    stored as given; ``message`` and ``request_id`` are taken from the
    ``message`` / ``requestId`` keys when the body is a JSON object, and
    stay ``None`` otherwise.
    """

    def __init__(self, status_code: int, body: str):
        """Store the status and body, and build a readable error message.

        Args:
            status_code: HTTP status code of the failed response.
            body: Response body text.
        """
        self.status_code = status_code
        self.body = body
        self.message: Optional[str] = None
        self.request_id: Optional[str] = None

        # Best effort: the body may not be JSON (ValueError) or may be JSON
        # that is not an object (AttributeError on .get).
        try:
            payload = json.loads(body)
            self.message = payload.get("message")
            self.request_id = payload.get("requestId")
        except (ValueError, AttributeError):
            pass

        # Prefer the API-provided message; otherwise show at most the first
        # 200 characters of the body, marking truncation with an ellipsis.
        if self.message:
            summary = self.message
        elif len(body) > 200:
            summary = body[:200] + "..."
        else:
            summary = body[:200]
        super().__init__(f"Diffbot API error {status_code}: {summary}")
31
+
32
+
33
class AuthError(APIError):
    """Raised when the API rejects the request's credentials (HTTP 401 or 403)."""
35
+
36
+
37
class RateLimitError(APIError):
    """Raised when the API reports that the rate limit was exceeded (HTTP 429)."""

    def __init__(self, status_code: int, body: str, retry_after: Optional[str] = None):
        """Initialise the API error and remember the retry hint.

        Args:
            status_code: HTTP status code of the failed response.
            body: Response body text.
            retry_after: Retry hint supplied by the caller, stored unparsed
                as ``self.retry_after``; ``None`` when not provided.
        """
        super().__init__(status_code, body)
        self.retry_after = retry_after
43
+
44
+
45
class ExtractionError(DiffbotError):
    """Raised when a successful HTTP response still reports an extraction failure."""

    def __init__(self, error_code: int, error: str):
        """Record the reported code and text and format the message.

        Args:
            error_code: Error code reported in the response body.
            error: Error text reported in the response body.
        """
        self.error_code, self.error = error_code, error
        super().__init__(f"Diffbot extraction error {error_code}: {error}")
diffbot/extract.py ADDED
@@ -0,0 +1,45 @@
1
+ """Diffbot Analyze API: extract structured content from a URL."""
2
+
3
+ from typing import TYPE_CHECKING, Any, Dict
4
+
5
+ if TYPE_CHECKING:
6
+ from .client import Diffbot, DiffbotAsync
7
+
8
+ from .errors import ExtractionError
9
+
10
+
11
+ def _build_params(client: Any, url: str, fmt: str) -> Dict[str, Any]:
12
+ params = {"token": client.token, "url": url, "timeout": 30000}
13
+ if fmt == "markdown":
14
+ params["mode"] = "llm"
15
+ return params
16
+
17
+
18
+ def _parse_response(client: Any, data: Dict[str, Any]) -> Dict[str, Any]:
19
+ if "errorCode" in data:
20
+ raise ExtractionError(data["errorCode"], data.get("error", ""))
21
+ return data
22
+
23
+
24
+ def _normalize_url(url: str) -> str:
25
+ return url if url.startswith("http") else f"https://{url}"
26
+
27
+
28
def extract(client: "Diffbot", url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
    """Call an extraction API for ``url`` and return the decoded response.

    Args:
        client: Synchronous Diffbot client.
        url: Page to extract; normalised by ``_normalize_url`` first.
        api: API name appended to ``client.analyze_url``.
        fmt: Output format forwarded to ``_build_params``.

    Raises:
        ExtractionError: If the response body reports an ``errorCode``.
        APIError: Via ``client._raise_for_status`` on an HTTP error.
    """
    endpoint = f"{client.analyze_url}/{api}"
    query = _build_params(client, _normalize_url(url), fmt)
    reply = client._http.get(endpoint, params=query)
    client._raise_for_status(reply)
    return _parse_response(client, reply.json())
36
+
37
+
38
async def extract_async(client: "DiffbotAsync", url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
    """Asynchronously call an extraction API for ``url`` and return the decoded response.

    Args:
        client: Async Diffbot client.
        url: Page to extract; normalised by ``_normalize_url`` first.
        api: API name appended to ``client.analyze_url``.
        fmt: Output format forwarded to ``_build_params``.

    Raises:
        ExtractionError: If the response body reports an ``errorCode``.
        APIError: Via ``client._raise_for_status`` on an HTTP error.
    """
    endpoint = f"{client.analyze_url}/{api}"
    query = _build_params(client, _normalize_url(url), fmt)
    reply = await client._http.get(endpoint, params=query)
    client._raise_for_status(reply)
    return _parse_response(client, reply.json())