diffbot-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffbot/__init__.py +29 -0
- diffbot/ask.py +48 -0
- diffbot/cli/__init__.py +399 -0
- diffbot/cli/__main__.py +4 -0
- diffbot/cli/_common.py +36 -0
- diffbot/cli/dql.py +308 -0
- diffbot/cli/entities.py +155 -0
- diffbot/cli/ontology.py +130 -0
- diffbot/client.py +285 -0
- diffbot/crawl.py +270 -0
- diffbot/errors.py +51 -0
- diffbot/extract.py +45 -0
- diffbot/kg.py +90 -0
- diffbot/nlp.py +37 -0
- diffbot/web_search.py +44 -0
- diffbot_python-0.1.0.dist-info/METADATA +218 -0
- diffbot_python-0.1.0.dist-info/RECORD +20 -0
- diffbot_python-0.1.0.dist-info/WHEEL +4 -0
- diffbot_python-0.1.0.dist-info/entry_points.txt +2 -0
- diffbot_python-0.1.0.dist-info/licenses/LICENSE +21 -0
diffbot/client.py
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
"""The Diffbot client classes (sync and async)."""
|
|
2
|
+
|
|
3
|
+
import pathlib
|
|
4
|
+
from types import TracebackType
|
|
5
|
+
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Type, Union
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .errors import APIError, AuthError, RateLimitError, ValidationError
|
|
11
|
+
from .extract import extract as _extract, extract_async as _extract_async
|
|
12
|
+
from .ask import ask as _ask, ask_async as _ask_async
|
|
13
|
+
from .crawl import (
|
|
14
|
+
CrawlEvent,
|
|
15
|
+
crawl as _crawl,
|
|
16
|
+
crawl_async as _crawl_async,
|
|
17
|
+
crawl_delete_job as _crawl_delete_job,
|
|
18
|
+
crawl_delete_job_async as _crawl_delete_job_async,
|
|
19
|
+
crawl_get_job as _crawl_get_job,
|
|
20
|
+
crawl_get_job_async as _crawl_get_job_async,
|
|
21
|
+
crawl_list_jobs as _crawl_list_jobs,
|
|
22
|
+
crawl_list_jobs_async as _crawl_list_jobs_async,
|
|
23
|
+
)
|
|
24
|
+
from .kg import (
|
|
25
|
+
dql as _dql,
|
|
26
|
+
dql_async as _dql_async,
|
|
27
|
+
dql_parallel as _dql_parallel,
|
|
28
|
+
dql_refresh_ontology as _dql_refresh_ontology,
|
|
29
|
+
)
|
|
30
|
+
from .web_search import (
|
|
31
|
+
WEB_SEARCH_BASE,
|
|
32
|
+
web_search as _web_search,
|
|
33
|
+
web_search_async as _web_search_async,
|
|
34
|
+
)
|
|
35
|
+
from .nlp import (
|
|
36
|
+
NLP_BASE,
|
|
37
|
+
entities as _entities,
|
|
38
|
+
entities_async as _entities_async,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Default service endpoints. Each one is only a default: both client classes
# accept a keyword argument to override it per instance.
EXTRACT_BASE = "https://api.diffbot.com/v3"
CRAWL_BASE = "https://api.diffbot.com/v3/crawl"
DIFFBOT_LLM_BASE = "https://llm.diffbot.com/rag/v1/chat/completions"

# Default ``timeout`` value passed to the underlying httpx client.
DEFAULT_TIMEOUT = 30.0
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Diffbot:
    """Synchronous client for the Diffbot APIs.

    The client stores the API token, one base URL per service and a shared
    ``httpx.Client``. Every public method delegates to a helper function from
    a sibling module, passing this instance as the first argument.

    Use the client as a context manager, or call :meth:`close`, to release
    the underlying HTTP client.

    Example:
        >>> import os
        >>> from diffbot import Diffbot
        >>> db = Diffbot(token=os.getenv("DIFFBOT_API_TOKEN"))
        >>> db.extract("https://example.com")
    """

    def __init__(
        self,
        token: str,
        *,
        timeout: float = DEFAULT_TIMEOUT,
        analyze_url: str = EXTRACT_BASE,
        llm_url: str = DIFFBOT_LLM_BASE,
        crawler_url: str = CRAWL_BASE,
        web_search_url: str = WEB_SEARCH_BASE,
        nlp_url: str = NLP_BASE,
        transport: Optional[httpx.BaseTransport] = None,
    ):
        """Record the token and endpoints and create the shared HTTP client.

        Args:
            token: Diffbot API token; must be non-empty.
            timeout: Timeout passed to ``httpx.Client``.
            analyze_url: Base URL used by :meth:`extract`.
            llm_url: Endpoint stored for the LLM helpers.
            crawler_url: Base URL used by the crawl helpers.
            web_search_url: Endpoint stored for the web search helper.
            nlp_url: Endpoint stored for the NLP helper.
            transport: Optional custom httpx transport (e.g. for tests).

        Raises:
            ValidationError: If ``token`` is empty or otherwise falsy.
        """
        if not token:
            raise ValidationError("token is required")

        self.token = token
        self.analyze_url = analyze_url
        self.llm_url = llm_url
        self.crawler_url = crawler_url
        self.web_search_url = web_search_url
        self.nlp_url = nlp_url

        # Every request made through this client identifies the SDK version.
        user_agent = f"diffbot-python/{__version__}"
        self._http = httpx.Client(
            timeout=timeout,
            headers={"User-Agent": user_agent},
            transport=transport,
        )

    def __enter__(self) -> "Diffbot":
        """Return the client itself so it can be bound by ``with ... as``."""
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying ``httpx.Client``."""
        self._http.close()

    def _raise_for_status(self, response: httpx.Response) -> None:
        """Translate an unsuccessful HTTP response into an SDK exception.

        Raises:
            RateLimitError: For status 429; carries the ``retry-after`` header.
            AuthError: For status 401 or 403.
            APIError: For any other unsuccessful status.
        """
        if response.is_success:
            return

        code, text = response.status_code, response.text
        if code == 429:
            raise RateLimitError(code, text, retry_after=response.headers.get("retry-after"))
        if code in (401, 403):
            raise AuthError(code, text)
        raise APIError(code, text)

    def extract(self, url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
        """Extract structured content from ``url`` and return the raw API response."""
        return _extract(self, url, api=api, fmt=fmt)

    def ask(self, messages: List[Dict[str, str]]) -> Iterator[str]:
        """Stream the Diffbot LLM RAG API's answer to ``messages`` chunk by chunk."""
        yield from _ask(self, messages)

    def crawl(self, site: str, **kwargs: Any) -> Iterator[CrawlEvent]:
        """Start a crawl job for ``site`` and yield its events.

        Keyword arguments are forwarded unchanged to the crawl helper; pass
        ``watch=True`` to poll until completion and yield URL_PROCESSED events.
        """
        yield from _crawl(self, site, **kwargs)

    def crawl_list_jobs(self) -> List[Dict[str, Any]]:
        """Return every crawler job registered for this token."""
        return _crawl_list_jobs(self)

    def crawl_get_job(self, job_name: str) -> Dict[str, Any]:
        """Return the status record of the crawler job ``job_name``."""
        return _crawl_get_job(self, job_name)

    def crawl_delete_job(self, job_name: str) -> None:
        """Delete the crawler job ``job_name``."""
        _crawl_delete_job(self, job_name)

    def dql(
        self,
        query: str,
        *,
        size: int = 10,
        from_: int = 0,
        format: str = "json",
        filter: Optional[str] = None,
        exportspec: Optional[str] = None,
        extra: Optional[Dict[str, str]] = None,
        raw: bool = False,
    ) -> Union[Dict[str, Any], bytes]:
        """Run a DQL query against the Diffbot Knowledge Graph.

        Returns:
            The parsed JSON response, or the raw response bytes when
            ``raw=True`` (e.g. to retrieve CSV/export formats undecoded).
        """
        return _dql(
            self,
            query,
            size=size,
            from_=from_,
            format=format,
            filter=filter,
            exportspec=exportspec,
            extra=extra,
            raw=raw,
        )

    def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
        """Run several DQL queries concurrently.

        Each item of ``queries`` is a dict of :meth:`dql` keyword arguments.
        """
        return _dql_parallel(self, queries, workers=workers)

    def dql_refresh_ontology(self, dest: pathlib.Path) -> None:
        """Download the Diffbot Knowledge Graph ontology and write it to ``dest``."""
        _dql_refresh_ontology(self, dest)

    def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """Search the web through the Diffbot LLM web search API."""
        return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)

    def entities(self, text: str, *, lang: str = "auto") -> Dict[str, Any]:
        """Identify and resolve entities and sentiment in ``text`` via the Diffbot NLP API.

        Entity IDs can be looked up in the Knowledge Graph via dql() using
        id:or("id1","id2","id3") — no type: declaration required.
        """
        return _entities(self, text, lang=lang)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class DiffbotAsync:
    """Asynchronous client for the Diffbot APIs.

    Mirrors the synchronous client but wraps an ``httpx.AsyncClient``; the
    public methods are coroutines (or async generators for streaming calls)
    that delegate to the ``*_async`` helpers of the sibling modules.

    Example:
        >>> import os
        >>> async with DiffbotAsync(token=os.getenv("DIFFBOT_API_TOKEN")) as db:
        ...     result = await db.extract("https://example.com")
    """

    def __init__(
        self,
        token: str,
        *,
        timeout: float = DEFAULT_TIMEOUT,
        analyze_url: str = EXTRACT_BASE,
        llm_url: str = DIFFBOT_LLM_BASE,
        crawler_url: str = CRAWL_BASE,
        web_search_url: str = WEB_SEARCH_BASE,
        nlp_url: str = NLP_BASE,
        transport: Optional[httpx.AsyncBaseTransport] = None,
    ):
        """Record the token and endpoints and create the shared async HTTP client.

        Args:
            token: Diffbot API token; must be non-empty.
            timeout: Timeout passed to ``httpx.AsyncClient``.
            analyze_url: Base URL used by :meth:`extract`.
            llm_url: Endpoint stored for the LLM helpers.
            crawler_url: Base URL used by the crawl helpers.
            web_search_url: Endpoint stored for the web search helper.
            nlp_url: Endpoint stored for the NLP helper.
            transport: Optional custom async httpx transport (e.g. for tests).

        Raises:
            ValidationError: If ``token`` is empty or otherwise falsy.
        """
        if not token:
            raise ValidationError("token is required")

        self.token = token
        self.analyze_url = analyze_url
        self.llm_url = llm_url
        self.crawler_url = crawler_url
        self.web_search_url = web_search_url
        self.nlp_url = nlp_url

        # Every request made through this client identifies the SDK version.
        user_agent = f"diffbot-python/{__version__}"
        self._http = httpx.AsyncClient(
            timeout=timeout,
            headers={"User-Agent": user_agent},
            transport=transport,
        )

    async def __aenter__(self) -> "DiffbotAsync":
        """Return the client itself so it can be bound by ``async with ... as``."""
        return self

    async def __aexit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc: Optional[BaseException],
        tb: Optional[TracebackType],
    ) -> None:
        """Close the HTTP client on leaving the ``async with`` block."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._http.aclose()

    def _raise_for_status(self, response: httpx.Response) -> None:
        """Translate an unsuccessful HTTP response into an SDK exception.

        Raises:
            RateLimitError: For status 429; carries the ``retry-after`` header.
            AuthError: For status 401 or 403.
            APIError: For any other unsuccessful status.
        """
        if response.is_success:
            return

        code, text = response.status_code, response.text
        if code == 429:
            raise RateLimitError(code, text, retry_after=response.headers.get("retry-after"))
        if code in (401, 403):
            raise AuthError(code, text)
        raise APIError(code, text)

    async def extract(self, url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
        """Extract structured content from ``url`` and return the raw API response."""
        return await _extract_async(self, url, api=api, fmt=fmt)

    async def ask(self, messages: List[Dict[str, str]]) -> AsyncIterator[str]:
        """Stream the Diffbot LLM RAG API's answer to ``messages`` chunk by chunk."""
        async for piece in _ask_async(self, messages):
            yield piece

    async def crawl(self, site: str, **kwargs: Any) -> AsyncIterator[CrawlEvent]:
        """Start a crawl job for ``site`` and yield its events.

        Pass ``watch=True`` to poll until completion and yield URL_PROCESSED
        events. Keyword arguments are forwarded unchanged to the crawl helper.
        """
        async for crawl_event in _crawl_async(self, site, **kwargs):
            yield crawl_event

    async def crawl_list_jobs(self) -> List[Dict[str, Any]]:
        """Return every crawler job registered for this token."""
        return await _crawl_list_jobs_async(self)

    async def crawl_get_job(self, job_name: str) -> Dict[str, Any]:
        """Return the status record of the crawler job ``job_name``."""
        return await _crawl_get_job_async(self, job_name)

    async def crawl_delete_job(self, job_name: str) -> None:
        """Delete the crawler job ``job_name``."""
        await _crawl_delete_job_async(self, job_name)

    async def dql(
        self,
        query: str,
        *,
        size: int = 10,
        from_: int = 0,
        format: str = "json",
        filter: Optional[str] = None,
        exportspec: Optional[str] = None,
        extra: Optional[Dict[str, str]] = None,
        raw: bool = False,
    ) -> Union[Dict[str, Any], bytes]:
        """Run a DQL query against the Diffbot Knowledge Graph.

        Returns:
            The parsed JSON response, or the raw response bytes when
            ``raw=True`` (e.g. to retrieve CSV/export formats undecoded).
        """
        return await _dql_async(
            self,
            query,
            size=size,
            from_=from_,
            format=format,
            filter=filter,
            exportspec=exportspec,
            extra=extra,
            raw=raw,
        )

    async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
        """Search the web through the Diffbot LLM web search API."""
        return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)

    async def entities(self, text: str, *, lang: str = "auto") -> Dict[str, Any]:
        """Identify and resolve entities and sentiment in ``text`` via the Diffbot NLP API.

        Entity IDs can be looked up in the Knowledge Graph via dql() using
        id:or("id1","id2","id3") — no type: declaration required.
        """
        return await _entities_async(self, text, lang=lang)
|
diffbot/crawl.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Diffbot Crawler API: start crawls, manage crawler jobs."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import csv
|
|
5
|
+
import io
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Iterator, List, Optional
|
|
11
|
+
|
|
12
|
+
from .errors import APIError
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from .client import Diffbot, DiffbotAsync
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CrawlEventType(Enum):
    """Kinds of event a crawl can emit."""

    # Emitted once, right after the crawler job has been created.
    JOB_CREATED = "job_created"
    # Emitted for each URL reported by the job while it is being watched.
    URL_PROCESSED = "url_processed"


@dataclass
class CrawlEvent:
    """A single event yielded while a crawl is in flight."""

    # Which kind of event this is.
    event_type: CrawlEventType
    # Time associated with the event, as a string.
    timestamp: str
    # Event-specific payload (e.g. the job name or the processed URL).
    details: Dict[str, Any]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def crawl(
    client: "Diffbot",
    site: str,
    hops: int = 2,
    job_name: Optional[str] = None,
    max_to_crawl: int = 100,
    max_to_process: int = 100,
    restrict_domain: bool = True,
    api_url: str = "",
    crawl_delay: float = -1,
    url_crawl_pattern: Optional[str] = None,
    url_process_pattern: Optional[str] = None,
    obey_robots: bool = False,
    use_proxies: bool = False,
    custom_headers: Optional[str] = None,
    watch: bool = False,
    poll_interval: float = 2.0,
) -> Iterator[CrawlEvent]:
    """Create a crawler job and yield events describing its progress.

    A JOB_CREATED event is yielded as soon as the job has been accepted. With
    ``watch=True`` the job is then polled, and a URL_PROCESSED event is
    yielded for every URL not reported before, until the job reports a status
    other than 0 or 7 or is no longer listed.

    Args:
        client: Synchronous client providing the token, crawler URL and HTTP
            session.
        site: Seed URL. ``https://`` is prepended when no ``http://`` or
            ``https://`` scheme is present.
        hops: Sent as ``maxHops``.
        job_name: Job name; defaults to ``crawl-<unix time>``.
        max_to_crawl: Sent as ``maxToCrawl``.
        max_to_process: Sent as ``maxToProcess``.
        restrict_domain: Sent as ``restrictDomain`` (1 or 0).
        api_url: Sent as ``apiUrl`` when non-empty.
        crawl_delay: Sent as ``crawlDelay`` only when greater than zero.
        url_crawl_pattern: Sent as ``urlCrawlPattern`` when set.
        url_process_pattern: Sent as ``urlProcessPattern`` when set.
        obey_robots: Sends ``obeyRobots=1`` when true.
        use_proxies: Sends ``useProxies=1`` when true.
        custom_headers: Sent as ``customHeaders`` when set.
        watch: Keep polling and yield URL_PROCESSED events.
        poll_interval: Seconds to sleep between polls.

    Raises:
        APIError: When a request is rejected (raised by
            ``client._raise_for_status``, possibly as a subclass), or when a
            finished job's message mentions a failure or an error.
    """
    # Test for a real scheme prefix: a bare ``startswith("http")`` would treat
    # hosts such as ``httpbin.org`` as if they already carried one.
    if not site.lower().startswith(("http://", "https://")):
        site = f"https://{site}"
    if not job_name:
        job_name = f"crawl-{int(time.time())}"

    params = _build_crawl_params(
        client, job_name, site, max_to_crawl, max_to_process, hops, restrict_domain,
        api_url, crawl_delay, url_crawl_pattern, url_process_pattern,
        obey_robots, use_proxies, custom_headers,
    )
    response = client._http.get(client.crawler_url, params=params)
    client._raise_for_status(response)

    yield CrawlEvent(
        event_type=CrawlEventType.JOB_CREATED,
        timestamp=datetime.now().isoformat(),
        details={"job_name": job_name},
    )

    if not watch:
        return

    # URLs already reported, so each one is yielded only once across polls.
    seen_urls: set = set()

    while True:
        status_response = client._http.get(
            client.crawler_url,
            params={"token": client.token, "name": job_name},
        )
        client._raise_for_status(status_response)
        # ``or`` also covers JSON nulls, which ``.get(key, default)`` lets through.
        jobs = status_response.json().get("jobs") or []
        if not jobs:
            break
        job_status = jobs[0].get("jobStatus") or {}
        status_code = job_status.get("status", 0)

        urls_response = client._http.get(
            f"{client.crawler_url}/data",
            params={"token": client.token, "name": job_name, "type": "urls"},
            follow_redirects=True,
        )
        client._raise_for_status(urls_response)
        if urls_response.content:
            yield from _parse_url_csv(urls_response.text, seen_urls)

        # Statuses 0 and 7 keep the loop polling; anything else ends it.
        # NOTE(review): presumably "not started" / "in progress" — confirm
        # against the Crawl API status-code table.
        if status_code not in (0, 7):
            message = job_status.get("message") or ""
            lowered = message.lower()
            if "fail" in lowered or "error" in lowered:
                raise APIError(500, f"Crawler job failed: {message}")
            break

        time.sleep(poll_interval)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _build_crawl_params(client: Any, job_name: str, site: str, max_to_crawl: int,
|
|
107
|
+
max_to_process: int, hops: int, restrict_domain: bool,
|
|
108
|
+
api_url: str, crawl_delay: float, url_crawl_pattern: Optional[str],
|
|
109
|
+
url_process_pattern: Optional[str], obey_robots: bool,
|
|
110
|
+
use_proxies: bool, custom_headers: Optional[str]) -> Dict[str, Any]:
|
|
111
|
+
params: Dict[str, Any] = {
|
|
112
|
+
"token": client.token,
|
|
113
|
+
"name": job_name,
|
|
114
|
+
"seeds": site,
|
|
115
|
+
"maxToCrawl": max_to_crawl,
|
|
116
|
+
"maxToProcess": max_to_process,
|
|
117
|
+
"maxHops": hops,
|
|
118
|
+
"restrictDomain": 1 if restrict_domain else 0,
|
|
119
|
+
}
|
|
120
|
+
if api_url:
|
|
121
|
+
params["apiUrl"] = api_url
|
|
122
|
+
if crawl_delay > 0:
|
|
123
|
+
params["crawlDelay"] = crawl_delay
|
|
124
|
+
if url_crawl_pattern:
|
|
125
|
+
params["urlCrawlPattern"] = url_crawl_pattern
|
|
126
|
+
if url_process_pattern:
|
|
127
|
+
params["urlProcessPattern"] = url_process_pattern
|
|
128
|
+
if obey_robots:
|
|
129
|
+
params["obeyRobots"] = 1
|
|
130
|
+
if use_proxies:
|
|
131
|
+
params["useProxies"] = 1
|
|
132
|
+
if custom_headers:
|
|
133
|
+
params["customHeaders"] = custom_headers
|
|
134
|
+
return params
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _parse_url_csv(text: str, seen_urls: set) -> List[CrawlEvent]:
    """Turn a CSV URL report into URL_PROCESSED events for unseen URLs.

    Args:
        text: CSV text with a header row; the ``Url``, ``Crawled Time`` and
            ``Crawl Status`` columns are read.
        seen_urls: URLs already reported. Updated in place with every URL
            for which an event is produced.

    Returns:
        One event per row whose URL is non-empty and not yet in
        ``seen_urls``, in row order.
    """
    events: List[CrawlEvent] = []
    for row in csv.DictReader(io.StringIO(text)):
        # DictReader fills the missing trailing fields of a short row with
        # None, so a plain ``row.get(key, default)`` can still return None.
        url = (row.get("Url") or "").strip('"')
        if not url or url in seen_urls:
            continue
        seen_urls.add(url)

        crawled_at = row.get("Crawled Time")
        if crawled_at is None:
            crawled_at = datetime.now().isoformat()
        crawl_status = row.get("Crawl Status")
        if crawl_status is None:
            crawl_status = "unknown"

        events.append(CrawlEvent(
            event_type=CrawlEventType.URL_PROCESSED,
            timestamp=crawled_at,
            details={"url": url, "status": crawl_status},
        ))
    return events
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
async def crawl_async(
    client: "DiffbotAsync",
    site: str,
    hops: int = 2,
    job_name: Optional[str] = None,
    max_to_crawl: int = 100,
    max_to_process: int = 100,
    restrict_domain: bool = True,
    api_url: str = "",
    crawl_delay: float = -1,
    url_crawl_pattern: Optional[str] = None,
    url_process_pattern: Optional[str] = None,
    obey_robots: bool = False,
    use_proxies: bool = False,
    custom_headers: Optional[str] = None,
    watch: bool = False,
    poll_interval: float = 2.0,
) -> AsyncIterator[CrawlEvent]:
    """Create a crawler job and asynchronously yield its progress events.

    A JOB_CREATED event is yielded as soon as the job has been accepted. With
    ``watch=True`` the job is then polled, and a URL_PROCESSED event is
    yielded for every URL not reported before, until the job reports a status
    other than 0 or 7 or is no longer listed.

    Args:
        client: Async client providing the token, crawler URL and HTTP session.
        site: Seed URL. ``https://`` is prepended when no ``http://`` or
            ``https://`` scheme is present.
        hops: Sent as ``maxHops``.
        job_name: Job name; defaults to ``crawl-<unix time>``.
        max_to_crawl: Sent as ``maxToCrawl``.
        max_to_process: Sent as ``maxToProcess``.
        restrict_domain: Sent as ``restrictDomain`` (1 or 0).
        api_url: Sent as ``apiUrl`` when non-empty.
        crawl_delay: Sent as ``crawlDelay`` only when greater than zero.
        url_crawl_pattern: Sent as ``urlCrawlPattern`` when set.
        url_process_pattern: Sent as ``urlProcessPattern`` when set.
        obey_robots: Sends ``obeyRobots=1`` when true.
        use_proxies: Sends ``useProxies=1`` when true.
        custom_headers: Sent as ``customHeaders`` when set.
        watch: Keep polling and yield URL_PROCESSED events.
        poll_interval: Seconds to sleep (without blocking the event loop)
            between polls.

    Raises:
        APIError: When a request is rejected (raised by
            ``client._raise_for_status``, possibly as a subclass), or when a
            finished job's message mentions a failure or an error.
    """
    # Test for a real scheme prefix: a bare ``startswith("http")`` would treat
    # hosts such as ``httpbin.org`` as if they already carried one.
    if not site.lower().startswith(("http://", "https://")):
        site = f"https://{site}"
    if not job_name:
        job_name = f"crawl-{int(time.time())}"

    params = _build_crawl_params(
        client, job_name, site, max_to_crawl, max_to_process, hops, restrict_domain,
        api_url, crawl_delay, url_crawl_pattern, url_process_pattern,
        obey_robots, use_proxies, custom_headers,
    )
    response = await client._http.get(client.crawler_url, params=params)
    client._raise_for_status(response)

    yield CrawlEvent(
        event_type=CrawlEventType.JOB_CREATED,
        timestamp=datetime.now().isoformat(),
        details={"job_name": job_name},
    )

    if not watch:
        return

    # URLs already reported, so each one is yielded only once across polls.
    seen_urls: set = set()

    while True:
        status_response = await client._http.get(
            client.crawler_url,
            params={"token": client.token, "name": job_name},
        )
        client._raise_for_status(status_response)
        # ``or`` also covers JSON nulls, which ``.get(key, default)`` lets through.
        jobs = status_response.json().get("jobs") or []
        if not jobs:
            break
        job_status = jobs[0].get("jobStatus") or {}
        status_code = job_status.get("status", 0)

        urls_response = await client._http.get(
            f"{client.crawler_url}/data",
            params={"token": client.token, "name": job_name, "type": "urls"},
            follow_redirects=True,
        )
        client._raise_for_status(urls_response)
        if urls_response.content:
            for event in _parse_url_csv(urls_response.text, seen_urls):
                yield event

        # Statuses 0 and 7 keep the loop polling; anything else ends it.
        # NOTE(review): presumably "not started" / "in progress" — confirm
        # against the Crawl API status-code table.
        if status_code not in (0, 7):
            message = job_status.get("message") or ""
            lowered = message.lower()
            if "fail" in lowered or "error" in lowered:
                raise APIError(500, f"Crawler job failed: {message}")
            break

        await asyncio.sleep(poll_interval)
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
async def crawl_list_jobs_async(client: "DiffbotAsync") -> List[Dict[str, Any]]:
    """Return the ``jobs`` list for the client's token (empty if absent)."""
    query = {"token": client.token}
    reply = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    payload = reply.json()
    return payload.get("jobs", [])
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
async def crawl_get_job_async(client: "DiffbotAsync", job_name: str) -> Dict[str, Any]:
    """Fetch the crawler job named ``job_name``.

    Returns the first entry of the response's ``jobs`` list, or an empty dict
    when that list is missing or empty.
    """
    query = {"token": client.token, "name": job_name}
    reply = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    matches = reply.json().get("jobs", [])
    if matches:
        return matches[0]
    return {}
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
async def crawl_delete_job_async(client: "DiffbotAsync", job_name: str) -> None:
    """Request deletion of the crawler job ``job_name``.

    The job is addressed by name on the crawler URL with ``delete=1``; any
    unsuccessful response is raised through ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name, "delete": 1}
    reply = await client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def crawl_list_jobs(client: "Diffbot") -> List[Dict[str, Any]]:
    """Return the ``jobs`` list for the client's token (empty if absent)."""
    query = {"token": client.token}
    reply = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    payload = reply.json()
    return payload.get("jobs", [])
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def crawl_get_job(client: "Diffbot", job_name: str) -> Dict[str, Any]:
    """Fetch the crawler job named ``job_name``.

    Returns the first entry of the response's ``jobs`` list, or an empty dict
    when that list is missing or empty.
    """
    query = {"token": client.token, "name": job_name}
    reply = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
    matches = reply.json().get("jobs", [])
    if matches:
        return matches[0]
    return {}
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def crawl_delete_job(client: "Diffbot", job_name: str) -> None:
    """Request deletion of the crawler job ``job_name``.

    The job is addressed by name on the crawler URL with ``delete=1``; any
    unsuccessful response is raised through ``client._raise_for_status``.
    """
    query = {"token": client.token, "name": job_name, "delete": 1}
    reply = client._http.get(client.crawler_url, params=query)
    client._raise_for_status(reply)
|
diffbot/errors.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Exception hierarchy for the Diffbot SDK."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DiffbotError(Exception):
    """Base class for all Diffbot SDK errors."""


class ValidationError(DiffbotError):
    """Client-side validation failed (e.g. missing token, malformed argument)."""


class APIError(DiffbotError):
    """The Diffbot API returned an error response.

    Attributes:
        status_code: HTTP status code of the response.
        body: Response body exactly as passed in.
        message: Human-readable error text taken from the body when it is a
            JSON object, otherwise ``None``.
        request_id: The body's ``requestId`` field when it is a JSON object,
            otherwise ``None``.
    """

    def __init__(self, status_code: int, body: str):
        """Parse ``body`` for error details and build the exception message.

        The body is parsed on a best-effort basis: one that is not JSON, or
        not a JSON object, simply leaves ``message`` and ``request_id`` unset
        and the (truncated) body text is displayed instead.
        """
        self.status_code = status_code
        self.body = body
        self.message: Optional[str] = None
        self.request_id: Optional[str] = None

        try:
            data = json.loads(body)
        except (TypeError, ValueError):
            # TypeError covers a missing (None) body; ValueError covers
            # malformed JSON (json.JSONDecodeError is a ValueError subclass).
            data = None

        if isinstance(data, dict):
            self.message = data.get("message")
            if self.message is None and isinstance(data.get("error"), str):
                # NOTE(review): extract._parse_response reads Diffbot error
                # text from an "error" key, so use it when "message" is absent.
                self.message = data["error"]
            self.request_id = data.get("requestId")

        # Normalise the body to text so the truncated fallback never fails
        # on a None or bytes body.
        if isinstance(body, str):
            text = body
        elif isinstance(body, (bytes, bytearray)):
            text = bytes(body).decode("utf-8", errors="replace")
        elif body is None:
            text = ""
        else:
            text = str(body)

        display = self.message or (text[:200] + ("..." if len(text) > 200 else ""))
        super().__init__(f"Diffbot API error {status_code}: {display}")


class AuthError(APIError):
    """Authentication failed (401, 403)."""


class RateLimitError(APIError):
    """Rate limit exceeded (429).

    Attributes:
        retry_after: Value of the response's ``retry-after`` header as
            received (a string), or ``None`` when it was not supplied.
    """

    def __init__(self, status_code: int, body: str, retry_after: Optional[str] = None):
        super().__init__(status_code, body)
        self.retry_after = retry_after


class ExtractionError(DiffbotError):
    """The Diffbot API returned a 200 but reported an extraction failure.

    Attributes:
        error_code: The ``errorCode`` reported in the response payload.
        error: The accompanying error text (may be empty).
    """

    def __init__(self, error_code: int, error: str):
        self.error_code = error_code
        self.error = error
        super().__init__(f"Diffbot extraction error {error_code}: {error}")
|
diffbot/extract.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Diffbot Analyze API: extract structured content from a URL."""
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Dict
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from .client import Diffbot, DiffbotAsync
|
|
7
|
+
|
|
8
|
+
from .errors import ExtractionError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _build_params(client: Any, url: str, fmt: str) -> Dict[str, Any]:
|
|
12
|
+
params = {"token": client.token, "url": url, "timeout": 30000}
|
|
13
|
+
if fmt == "markdown":
|
|
14
|
+
params["mode"] = "llm"
|
|
15
|
+
return params
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_response(client: Any, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
19
|
+
if "errorCode" in data:
|
|
20
|
+
raise ExtractionError(data["errorCode"], data.get("error", ""))
|
|
21
|
+
return data
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _normalize_url(url: str) -> str:
|
|
25
|
+
return url if url.startswith("http") else f"https://{url}"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def extract(client: "Diffbot", url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
    """Call the extraction endpoint ``api`` for ``url`` and return the payload.

    The URL is normalised first, the request goes to
    ``<client.analyze_url>/<api>``, and the decoded JSON is returned after
    passing the HTTP-status and payload error checks.
    """
    target = _normalize_url(url)
    endpoint = f"{client.analyze_url}/{api}"
    reply = client._http.get(endpoint, params=_build_params(client, target, fmt))
    client._raise_for_status(reply)
    payload = reply.json()
    return _parse_response(client, payload)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def extract_async(client: "DiffbotAsync", url: str, api: str = "analyze", fmt: str = "markdown") -> Dict[str, Any]:
    """Asynchronously call the extraction endpoint ``api`` for ``url``.

    The URL is normalised first, the request goes to
    ``<client.analyze_url>/<api>``, and the decoded JSON is returned after
    passing the HTTP-status and payload error checks.
    """
    target = _normalize_url(url)
    endpoint = f"{client.analyze_url}/{api}"
    reply = await client._http.get(endpoint, params=_build_params(client, target, fmt))
    client._raise_for_status(reply)
    payload = reply.json()
    return _parse_response(client, payload)
|