diffbot-python 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diffbot/__init__.py +10 -1
- diffbot/_auth.py +41 -0
- diffbot/cli/_common.py +8 -23
- diffbot/cli/dql.py +3 -1
- diffbot/cli/ontology.py +31 -87
- diffbot/client.py +18 -2
- diffbot/kg.py +38 -0
- diffbot/ontology.py +160 -0
- diffbot/web_search.py +2 -2
- {diffbot_python-0.1.0.dist-info → diffbot_python-0.2.1.dist-info}/METADATA +55 -7
- diffbot_python-0.2.1.dist-info/RECORD +22 -0
- {diffbot_python-0.1.0.dist-info → diffbot_python-0.2.1.dist-info}/WHEEL +1 -1
- diffbot_python-0.1.0.dist-info/RECORD +0 -20
- {diffbot_python-0.1.0.dist-info → diffbot_python-0.2.1.dist-info}/entry_points.txt +0 -0
- {diffbot_python-0.1.0.dist-info → diffbot_python-0.2.1.dist-info}/licenses/LICENSE +0 -0
diffbot/__init__.py
CHANGED
|
@@ -2,8 +2,14 @@
|
|
|
2
2
|
diffbot - Python client library for the Diffbot APIs.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
from importlib.metadata import PackageNotFoundError, version as _version
|
|
6
6
|
|
|
7
|
+
try:
|
|
8
|
+
__version__ = _version("diffbot-python")
|
|
9
|
+
except PackageNotFoundError: # not installed (e.g. running from a source tree)
|
|
10
|
+
__version__ = "0.0.0"
|
|
11
|
+
|
|
12
|
+
from ._auth import resolve_token
|
|
7
13
|
from .client import Diffbot, DiffbotAsync
|
|
8
14
|
from .crawl import CrawlEvent, CrawlEventType
|
|
9
15
|
from .errors import (
|
|
@@ -14,12 +20,15 @@ from .errors import (
|
|
|
14
20
|
RateLimitError,
|
|
15
21
|
ValidationError,
|
|
16
22
|
)
|
|
23
|
+
from .ontology import Ontology
|
|
17
24
|
|
|
18
25
|
__all__ = [
|
|
19
26
|
"Diffbot",
|
|
20
27
|
"DiffbotAsync",
|
|
28
|
+
"resolve_token",
|
|
21
29
|
"CrawlEvent",
|
|
22
30
|
"CrawlEventType",
|
|
31
|
+
"Ontology",
|
|
23
32
|
"DiffbotError",
|
|
24
33
|
"AuthError",
|
|
25
34
|
"ExtractionError",
|
diffbot/_auth.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Shared Diffbot credential resolution for both the library and the CLI.
|
|
2
|
+
|
|
3
|
+
The same lookup chain is used everywhere so a single credential works for the
|
|
4
|
+
``db`` CLI and any Python script that constructs a client:
|
|
5
|
+
|
|
6
|
+
1. An explicit token passed to the client / function.
|
|
7
|
+
2. The ``DIFFBOT_API_TOKEN`` environment variable.
|
|
8
|
+
3. A ``DIFFBOT_API_TOKEN=...`` line in ``~/.diffbot/credentials``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import pathlib
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
TOKEN_ENV_VAR = "DIFFBOT_API_TOKEN"
|
|
16
|
+
CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _read_credentials_file() -> str:
    """Return the token stored in the shared credentials file, or ``""``.

    Scans ``CREDENTIALS_PATH`` for the first line of the form
    ``<TOKEN_ENV_VAR>=<value>`` (leading/trailing whitespace on the line is
    ignored) and returns ``<value>`` stripped of surrounding whitespace.

    Returns:
        The stored token, or an empty string when the file is missing, cannot
        be read or decoded, or contains no matching line.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        text = CREDENTIALS_PATH.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Reading directly (instead of exists() + read) avoids a race with the
        # file disappearing, and also covers a path that exists but is
        # unreadable: the lookup chain simply reports "no token".
        return ""

    prefix = f"{TOKEN_ENV_VAR}="
    for line in text.splitlines():
        line = line.strip()
        if line.startswith(prefix):
            return line[len(prefix):].strip()
    return ""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def resolve_token(token: Optional[str] = None) -> str:
    """Look up a Diffbot API token through the shared resolution chain.

    Sources are consulted in order and the first non-blank value wins: the
    ``token`` argument, the environment variable named by ``TOKEN_ENV_VAR``,
    then the credentials file. Values are returned with surrounding whitespace
    stripped; an empty string means no source supplied a token.
    """
    explicit = token.strip() if token else ""
    if explicit:
        return explicit

    from_env = os.environ.get(TOKEN_ENV_VAR, "").strip()
    if not from_env:
        return _read_credentials_file()
    return from_env
|
diffbot/cli/_common.py
CHANGED
|
@@ -1,35 +1,20 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import pathlib
|
|
3
|
-
|
|
4
1
|
import click
|
|
5
2
|
|
|
6
|
-
from diffbot import Diffbot
|
|
7
|
-
|
|
8
|
-
CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def resolve_token() -> str:
|
|
12
|
-
"""Return the Diffbot API token from the env var, falling back to ~/.diffbot/credentials."""
|
|
13
|
-
token = os.environ.get("DIFFBOT_API_TOKEN", "").strip()
|
|
14
|
-
if token:
|
|
15
|
-
return token
|
|
16
|
-
|
|
17
|
-
if CREDENTIALS_PATH.exists():
|
|
18
|
-
for line in CREDENTIALS_PATH.read_text().splitlines():
|
|
19
|
-
line = line.strip()
|
|
20
|
-
if line.startswith("DIFFBOT_API_TOKEN="):
|
|
21
|
-
return line[len("DIFFBOT_API_TOKEN="):].strip()
|
|
22
|
-
|
|
23
|
-
return ""
|
|
3
|
+
from diffbot import Diffbot, resolve_token
|
|
4
|
+
from diffbot._auth import CREDENTIALS_PATH, TOKEN_ENV_VAR
|
|
24
5
|
|
|
25
6
|
|
|
26
7
|
def get_client() -> Diffbot:
|
|
8
|
+
"""Build a Diffbot client using the shared credential resolution chain.
|
|
9
|
+
|
|
10
|
+
Looks at the DIFFBOT_API_TOKEN env var, then ~/.diffbot/credentials.
|
|
11
|
+
"""
|
|
27
12
|
token = resolve_token()
|
|
28
13
|
if not token:
|
|
29
14
|
click.echo(
|
|
30
15
|
"Error: no Diffbot API token found.\n"
|
|
31
|
-
" Set a
|
|
32
|
-
f" write '
|
|
16
|
+
f" Set a {TOKEN_ENV_VAR} environment variable, or\n"
|
|
17
|
+
f" write '{TOKEN_ENV_VAR}=YOUR_TOKEN' to {CREDENTIALS_PATH}",
|
|
33
18
|
err=True,
|
|
34
19
|
)
|
|
35
20
|
raise click.Abort()
|
diffbot/cli/dql.py
CHANGED
|
@@ -15,7 +15,9 @@ from rich.table import Table
|
|
|
15
15
|
from diffbot import DiffbotError
|
|
16
16
|
|
|
17
17
|
from . import ontology
|
|
18
|
-
from
|
|
18
|
+
from diffbot import resolve_token
|
|
19
|
+
|
|
20
|
+
from ._common import get_client
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
class _DqlGroup(click.Group):
|
diffbot/cli/ontology.py
CHANGED
|
@@ -1,14 +1,24 @@
|
|
|
1
|
+
"""CLI-side ontology access: a disk cache over the storage-agnostic core.
|
|
2
|
+
|
|
3
|
+
The navigation logic lives in :mod:`diffbot.ontology` (the `Ontology` class).
|
|
4
|
+
This module adds the CLI's caching policy on top: the ontology is read once from
|
|
5
|
+
``~/.diffbot/ontology.json`` (populated by `db dql init`) and held in
|
|
6
|
+
``_CACHE``. The module-level functions preserve the historical CLI surface and
|
|
7
|
+
simply delegate to an `Ontology` built from the cached document.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
import json
|
|
2
11
|
import pathlib
|
|
3
|
-
import
|
|
4
|
-
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
from diffbot.ontology import Ontology
|
|
5
15
|
|
|
6
16
|
ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
|
|
7
17
|
|
|
8
18
|
_CACHE: Dict[str, Any] = {}
|
|
9
19
|
|
|
10
20
|
|
|
11
|
-
def
|
|
21
|
+
def _data() -> Dict[str, Any]:
|
|
12
22
|
if "data" not in _CACHE:
|
|
13
23
|
if not ONTOLOGY_PATH.exists():
|
|
14
24
|
raise FileNotFoundError(
|
|
@@ -18,113 +28,47 @@ def load() -> Dict[str, Any]:
|
|
|
18
28
|
return _CACHE["data"]
|
|
19
29
|
|
|
20
30
|
|
|
31
|
+
def _ontology() -> Ontology:
    """Wrap the document returned by ``_data()`` in a new ``Ontology``."""
    document = _data()
    return Ontology(document)
|
|
33
|
+
|
|
34
|
+
|
|
21
35
|
def list_types() -> List[str]:
|
|
22
|
-
return
|
|
36
|
+
return _ontology().types()
|
|
23
37
|
|
|
24
38
|
|
|
25
39
|
def list_composites() -> List[str]:
|
|
26
|
-
return
|
|
40
|
+
return _ontology().composites()
|
|
27
41
|
|
|
28
42
|
|
|
29
43
|
def list_enums() -> List[str]:
|
|
30
|
-
return
|
|
44
|
+
return _ontology().enums()
|
|
31
45
|
|
|
32
46
|
|
|
33
47
|
def list_taxonomies() -> List[str]:
|
|
34
|
-
return
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
|
|
38
|
-
entry = container.get(type_name)
|
|
39
|
-
if entry is None:
|
|
40
|
-
raise KeyError(f"Unknown name: {type_name}")
|
|
41
|
-
return entry.get("fields", {})
|
|
48
|
+
return _ontology().taxonomies()
|
|
42
49
|
|
|
43
50
|
|
|
44
51
|
def fields_for(type_name: str) -> Dict[str, Any]:
|
|
45
|
-
|
|
46
|
-
types = data.get("types", {})
|
|
47
|
-
composites = data.get("composites", {})
|
|
48
|
-
if type_name in types:
|
|
49
|
-
return _fields_of(types, type_name)
|
|
50
|
-
if type_name in composites:
|
|
51
|
-
return _fields_of(composites, type_name)
|
|
52
|
-
raise KeyError(f"{type_name} is not a known entity type or composite")
|
|
52
|
+
return _ontology().fields_for(type_name)
|
|
53
53
|
|
|
54
54
|
|
|
55
55
|
def format_field(name: str, meta: Dict[str, Any]) -> str:
|
|
56
|
-
|
|
57
|
-
if t == "LinkedEntity":
|
|
58
|
-
le = meta.get("leType") or []
|
|
59
|
-
if le:
|
|
60
|
-
t = f"LinkedEntity ({le[0]})"
|
|
61
|
-
flags = []
|
|
62
|
-
if meta.get("isList"):
|
|
63
|
-
flags.append("isList")
|
|
64
|
-
if meta.get("isComposite"):
|
|
65
|
-
flags.append("isComposite")
|
|
66
|
-
if meta.get("isEnum"):
|
|
67
|
-
flags.append("isEnum")
|
|
68
|
-
if meta.get("isDeprecated"):
|
|
69
|
-
flags.append("DEPRECATED")
|
|
70
|
-
suffix = "".join(f" [{f}]" for f in flags)
|
|
71
|
-
return f"{name}: [{t}]{suffix}"
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def filter_fields(fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False) -> List[tuple]:
|
|
75
|
-
pattern = re.compile(search, re.IGNORECASE) if search else None
|
|
76
|
-
out = []
|
|
77
|
-
for name, meta in fields.items():
|
|
78
|
-
if not include_deprecated and meta.get("isDeprecated"):
|
|
79
|
-
continue
|
|
80
|
-
if pattern and not pattern.search(name):
|
|
81
|
-
continue
|
|
82
|
-
out.append((name, meta))
|
|
83
|
-
return out
|
|
56
|
+
return Ontology.format_field(name, meta)
|
|
84
57
|
|
|
85
58
|
|
|
86
|
-
def
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
raise KeyError(f"Unknown taxonomy: {name}")
|
|
91
|
-
pattern = re.compile(search, re.IGNORECASE) if search else None
|
|
92
|
-
out: List[str] = []
|
|
59
|
+
def filter_fields(
    fields: Dict[str, Any],
    search: Optional[str],
    include_deprecated: bool = False,
) -> List[Tuple[str, Dict[str, Any]]]:
    """Module-level shim that forwards to ``Ontology.filter_fields``."""
    matches = Ontology.filter_fields(
        fields,
        search,
        include_deprecated=include_deprecated,
    )
    return matches
|
|
93
63
|
|
|
94
|
-
def walk(node: Dict[str, Any]) -> None:
|
|
95
|
-
n = node.get("name")
|
|
96
|
-
if n and (pattern is None or pattern.search(n)):
|
|
97
|
-
out.append(n)
|
|
98
|
-
for child in node.get("children", []) or []:
|
|
99
|
-
walk(child)
|
|
100
64
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
return out
|
|
65
|
+
def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
    """Module-level shim that forwards to ``Ontology.taxonomy_values``."""
    ontology = _ontology()
    return ontology.taxonomy_values(name, search)
|
|
104
67
|
|
|
105
68
|
|
|
106
69
|
def enum_values(name: str) -> List[str]:
|
|
107
|
-
|
|
108
|
-
enum = data.get("enums", {}).get(name)
|
|
109
|
-
if enum is None:
|
|
110
|
-
raise KeyError(f"Unknown enum: {name}")
|
|
111
|
-
return list(enum.get("values", []))
|
|
70
|
+
return _ontology().enum_values(name)
|
|
112
71
|
|
|
113
72
|
|
|
114
73
|
def find_named(search: str) -> List[str]:
|
|
115
|
-
|
|
116
|
-
found = set()
|
|
117
|
-
|
|
118
|
-
def walk(node: Any) -> None:
|
|
119
|
-
if isinstance(node, dict):
|
|
120
|
-
n = node.get("name")
|
|
121
|
-
if isinstance(n, str) and pattern.search(n):
|
|
122
|
-
found.add(n)
|
|
123
|
-
for v in node.values():
|
|
124
|
-
walk(v)
|
|
125
|
-
elif isinstance(node, list):
|
|
126
|
-
for v in node:
|
|
127
|
-
walk(v)
|
|
128
|
-
|
|
129
|
-
walk(load())
|
|
130
|
-
return sorted(found)
|
|
74
|
+
return _ontology().find_named(search)
|
diffbot/client.py
CHANGED
|
@@ -24,9 +24,13 @@ from .crawl import (
|
|
|
24
24
|
from .kg import (
|
|
25
25
|
dql as _dql,
|
|
26
26
|
dql_async as _dql_async,
|
|
27
|
+
dql_fetch_ontology as _dql_fetch_ontology,
|
|
28
|
+
dql_fetch_ontology_async as _dql_fetch_ontology_async,
|
|
27
29
|
dql_parallel as _dql_parallel,
|
|
30
|
+
dql_parallel_async as _dql_parallel_async,
|
|
28
31
|
dql_refresh_ontology as _dql_refresh_ontology,
|
|
29
32
|
)
|
|
33
|
+
from .ontology import Ontology
|
|
30
34
|
from .web_search import (
|
|
31
35
|
WEB_SEARCH_BASE,
|
|
32
36
|
web_search as _web_search,
|
|
@@ -48,8 +52,8 @@ class Diffbot:
|
|
|
48
52
|
"""Client for the Diffbot APIs.
|
|
49
53
|
|
|
50
54
|
Example:
|
|
51
|
-
>>> from diffbot import Diffbot
|
|
52
|
-
>>> db = Diffbot(token=
|
|
55
|
+
>>> from diffbot import Diffbot, resolve_token
|
|
56
|
+
>>> db = Diffbot(token=resolve_token()) # env var or ~/.diffbot/credentials
|
|
53
57
|
>>> db.extract("https://example.com")
|
|
54
58
|
"""
|
|
55
59
|
|
|
@@ -155,6 +159,10 @@ class Diffbot:
|
|
|
155
159
|
"""Download the Diffbot Knowledge Graph ontology and write it to dest."""
|
|
156
160
|
_dql_refresh_ontology(self, dest)
|
|
157
161
|
|
|
162
|
+
def dql_fetch_ontology(self) -> Ontology:
    """Return the ontology as a queryable Ontology.

    Delegates to the Knowledge Graph helper on every call; this method keeps
    no cached copy of the result.
    """
    ontology = _dql_fetch_ontology(self)
    return ontology
|
|
165
|
+
|
|
158
166
|
def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
|
|
159
167
|
"""Search the web via the Diffbot LLM web search API."""
|
|
160
168
|
return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)
|
|
@@ -272,6 +280,14 @@ class DiffbotAsync:
|
|
|
272
280
|
"""
|
|
273
281
|
return await _dql_async(self, query, size=size, from_=from_, format=format, filter=filter, exportspec=exportspec, extra=extra, raw=raw)
|
|
274
282
|
|
|
283
|
+
async def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
    """Run several DQL queries concurrently.

    Each element of ``queries`` is a dict of ``dql()`` keyword arguments;
    ``workers`` is forwarded unchanged to the Knowledge Graph helper.
    """
    results = await _dql_parallel_async(self, queries, workers=workers)
    return results
|
|
286
|
+
|
|
287
|
+
async def dql_fetch_ontology(self) -> Ontology:
    """Return the ontology as a queryable Ontology.

    Awaits the async Knowledge Graph helper on every call; this method keeps
    no cached copy of the result.
    """
    ontology = await _dql_fetch_ontology_async(self)
    return ontology
|
|
290
|
+
|
|
275
291
|
async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
|
|
276
292
|
"""Search the web via the Diffbot LLM web search API."""
|
|
277
293
|
return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)
|
diffbot/kg.py
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""Diffbot Knowledge Graph APIs: DQL search and entity enhancement."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import pathlib
|
|
4
5
|
from concurrent.futures import ThreadPoolExecutor
|
|
5
6
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
|
|
6
7
|
|
|
8
|
+
from .ontology import Ontology
|
|
9
|
+
|
|
7
10
|
if TYPE_CHECKING:
|
|
8
11
|
from .client import Diffbot, DiffbotAsync
|
|
9
12
|
|
|
@@ -83,8 +86,43 @@ def dql_parallel(
|
|
|
83
86
|
return list(ex.map(lambda q: dql(client, **q), queries))
|
|
84
87
|
|
|
85
88
|
|
|
89
|
+
async def dql_parallel_async(
    client: "DiffbotAsync",
    queries: Sequence[Dict[str, Any]],
    *,
    workers: int = 8,
) -> List[Union[Dict[str, Any], bytes]]:
    """Run several DQL queries concurrently on one async client.

    Args:
        client: Async client handed to ``dql_async`` for every query.
        queries: One dict of ``dql_async`` keyword arguments per query.
        workers: Maximum number of queries allowed in flight at once.

    Returns:
        One result per query, in the same order as ``queries``. An empty
        ``queries`` returns ``[]`` without using the client.

    Raises:
        ValueError: If ``workers`` is less than 1.
    """
    if not queries:
        return []
    if workers < 1:
        # Semaphore(0) never admits a task, so gather() below would wait
        # forever; reject the value up front instead of hanging.
        raise ValueError(f"workers must be at least 1, got {workers}")

    # The semaphore caps how many dql_async calls run at the same time.
    sem = asyncio.Semaphore(workers)

    async def _one(q: Dict[str, Any]) -> Union[Dict[str, Any], bytes]:
        async with sem:
            return await dql_async(client, **q)

    # gather() returns results in the order the awaitables were passed.
    return await asyncio.gather(*(_one(q) for q in queries))
|
|
104
|
+
|
|
105
|
+
|
|
86
106
|
def dql_refresh_ontology(client: "Diffbot", dest: pathlib.Path) -> None:
|
|
87
107
|
response = client._http.get(KG_ONTOLOGY_ENDPOINT)
|
|
88
108
|
client._raise_for_status(response)
|
|
89
109
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
90
110
|
dest.write_bytes(response.content)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def dql_fetch_ontology(client: "Diffbot") -> Ontology:
    """Fetch the ontology document and wrap it in an :class:`Ontology`.

    The response is passed through ``client._raise_for_status`` before its
    body is parsed. Nothing is cached or written to disk here — holding on to
    the result is the caller's job. To persist the raw bytes to a file, use
    :func:`dql_refresh_ontology` instead.
    """
    resp = client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def dql_fetch_ontology_async(client: "DiffbotAsync") -> Ontology:
    """Awaitable counterpart of :func:`dql_fetch_ontology`.

    Awaits the GET on ``client._http``, checks the response with
    ``client._raise_for_status``, and parses the body into an
    :class:`Ontology`.
    """
    resp = await client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
|
diffbot/ontology.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""In-memory navigation of the Diffbot Knowledge Graph ontology.
|
|
2
|
+
|
|
3
|
+
The ontology is a JSON document describing the Knowledge Graph's entity types,
|
|
4
|
+
composite types, enums, and taxonomies. An agent constructing DQL needs it to
|
|
5
|
+
look up real field paths and taxonomy values instead of guessing them.
|
|
6
|
+
|
|
7
|
+
This module is pure and storage-agnostic: build an :class:`Ontology` from
|
|
8
|
+
already-parsed data (or from raw JSON / a file path) and query it. How the
|
|
9
|
+
ontology document is fetched, and whether or where it is cached, is left
|
|
10
|
+
entirely to the caller — the `db` CLI caches it on disk at
|
|
11
|
+
``~/.diffbot/ontology.json``; an in-process consumer (e.g. langchain) can cache
|
|
12
|
+
the :class:`Ontology` in memory. Fetch a fresh one over HTTP with
|
|
13
|
+
:meth:`diffbot.Diffbot.dql_fetch_ontology`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import pathlib
|
|
18
|
+
import re
|
|
19
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Ontology:
    """Queryable view over a parsed Diffbot ontology document.

    The instance holds the parsed document on :attr:`data` and exposes lookup
    methods over it. The lookups themselves perform no I/O; construct with
    already parsed data, or use :meth:`from_json` / :meth:`from_path` (which
    reads one file) for convenience.
    """

    def __init__(self, data: Dict[str, Any]):
        """Store ``data`` (the parsed ontology document) without copying it."""
        self.data = data

    @classmethod
    def from_json(cls, raw: Union[str, bytes]) -> "Ontology":
        """Build from a raw JSON string or bytes (e.g. an HTTP response body)."""
        return cls(json.loads(raw))

    @classmethod
    def from_path(cls, path: Union[str, pathlib.Path]) -> "Ontology":
        """Build from a JSON file on disk.

        The file is read as bytes and decoded by ``json.loads``, which detects
        UTF-8 (with or without BOM), UTF-16 and UTF-32, so the result does not
        depend on the platform's locale encoding.
        """
        return cls(json.loads(pathlib.Path(path).read_bytes()))

    def types(self) -> List[str]:
        """All entity type names (e.g. ``Organization``, ``Person``), sorted."""
        return sorted(self.data.get("types", {}).keys())

    def composites(self) -> List[str]:
        """All composite type names (e.g. ``Location``, ``Employment``), sorted."""
        return sorted(self.data.get("composites", {}).keys())

    def enums(self) -> List[str]:
        """All enum type names (e.g. ``Language``, ``Gender``), sorted."""
        return sorted(self.data.get("enums", {}).keys())

    def taxonomies(self) -> List[str]:
        """All taxonomy names (e.g. ``OrganizationCategory``), sorted."""
        return sorted(self.data.get("taxonomies", {}).keys())

    @staticmethod
    def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
        """Return ``container[type_name]["fields"]`` (``{}`` if absent).

        Raises:
            KeyError: If ``type_name`` is not a key of ``container``.
        """
        entry = container.get(type_name)
        if entry is None:
            raise KeyError(f"Unknown name: {type_name}")
        return entry.get("fields", {})

    def fields_for(self, type_name: str) -> Dict[str, Any]:
        """Return the field map of an entity type or composite.

        Auto-routes: ``type_name`` may be an entity type (``Organization``) or
        a composite (``Location``); entity types are checked first.

        Raises:
            KeyError: If ``type_name`` is neither an entity type nor a composite.
        """
        types = self.data.get("types", {})
        composites = self.data.get("composites", {})
        if type_name in types:
            return self._fields_of(types, type_name)
        if type_name in composites:
            return self._fields_of(composites, type_name)
        raise KeyError(f"{type_name} is not a known entity type or composite")

    @staticmethod
    def filter_fields(
        fields: Dict[str, Any],
        search: Optional[str],
        include_deprecated: bool = False,
    ) -> List[Tuple[str, Dict[str, Any]]]:
        """Filter a field map by a name regex, dropping deprecated by default.

        Args:
            fields: Mapping of field name to field metadata.
            search: Regular expression matched case-insensitively anywhere in
                the field name; a falsy value disables name filtering.
            include_deprecated: Keep fields whose metadata sets ``isDeprecated``.

        Returns:
            ``(name, meta)`` pairs in the iteration order of ``fields``.
        """
        pattern = re.compile(search, re.IGNORECASE) if search else None
        return [
            (name, meta)
            for name, meta in fields.items()
            if (include_deprecated or not meta.get("isDeprecated"))
            and (pattern is None or pattern.search(name))
        ]

    def taxonomy_values(self, name: str, search: Optional[str] = None) -> List[str]:
        """Flatten a taxonomy's values (recursing into children), optionally filtered.

        Names are collected parent-before-children in document order; the
        result is neither sorted nor de-duplicated. ``search`` is a regular
        expression matched case-insensitively anywhere in each name.

        Raises:
            KeyError: If ``name`` is not a known taxonomy.
        """
        tax = self.data.get("taxonomies", {}).get(name)
        if tax is None:
            raise KeyError(f"Unknown taxonomy: {name}")
        pattern = re.compile(search, re.IGNORECASE) if search else None
        out: List[str] = []

        def walk(node: Dict[str, Any]) -> None:
            n = node.get("name")
            if n and (pattern is None or pattern.search(n)):
                out.append(n)
            # ``or []`` tolerates an explicit null for "children".
            for child in node.get("children", []) or []:
                walk(child)

        for cat in tax.get("categories", []) or []:
            walk(cat)
        return out

    def enum_values(self, name: str) -> List[str]:
        """Return a new list with the allowed values of an enum.

        Raises:
            KeyError: If ``name`` is not a known enum.
        """
        enum = self.data.get("enums", {}).get(name)
        if enum is None:
            raise KeyError(f"Unknown enum: {name}")
        return list(enum.get("values", []))

    def find_named(self, search: str) -> List[str]:
        """Fallback search: every ``name`` anywhere in the document matching a regex.

        Walks every dict and list in the document, collects string values
        stored under a ``"name"`` key that match ``search`` (case-insensitive),
        and returns them sorted with duplicates removed.
        """
        pattern = re.compile(search, re.IGNORECASE)
        found = set()

        def walk(node: Any) -> None:
            if isinstance(node, dict):
                n = node.get("name")
                if isinstance(n, str) and pattern.search(n):
                    found.add(n)
                for v in node.values():
                    walk(v)
            elif isinstance(node, list):
                for v in node:
                    walk(v)

        walk(self.data)
        return sorted(found)

    @staticmethod
    def format_field(name: str, meta: Dict[str, Any]) -> str:
        """Render one field as ``<name>: [<type>] [flags...]`` for display.

        A ``LinkedEntity`` field with a non-empty ``leType`` shows the first
        linked type in parentheses. A missing ``type`` renders as ``?``.
        """
        t = meta.get("type", "?")
        if t == "LinkedEntity":
            le = meta.get("leType") or []
            if le:
                t = f"LinkedEntity ({le[0]})"
        # (metadata key, label shown) in display order.
        flag_labels = (
            ("isList", "isList"),
            ("isComposite", "isComposite"),
            ("isEnum", "isEnum"),
            ("isDeprecated", "DEPRECATED"),
        )
        suffix = "".join(f" [{label}]" for key, label in flag_labels if meta.get(key))
        return f"{name}: [{t}]{suffix}"
|
diffbot/web_search.py
CHANGED
|
@@ -18,7 +18,7 @@ def web_search(
|
|
|
18
18
|
headers = {"Authorization": f"Bearer {client.token}"}
|
|
19
19
|
params: Dict[str, Any] = {"text": text}
|
|
20
20
|
if num_results is not None:
|
|
21
|
-
params["
|
|
21
|
+
params["size"] = num_results
|
|
22
22
|
if max_tokens is not None:
|
|
23
23
|
params["maxTokens"] = max_tokens
|
|
24
24
|
response = client._http.get(client.web_search_url, headers=headers, params=params)
|
|
@@ -36,7 +36,7 @@ async def web_search_async(
|
|
|
36
36
|
headers = {"Authorization": f"Bearer {client.token}"}
|
|
37
37
|
params: Dict[str, Any] = {"text": text}
|
|
38
38
|
if num_results is not None:
|
|
39
|
-
params["
|
|
39
|
+
params["size"] = num_results
|
|
40
40
|
if max_tokens is not None:
|
|
41
41
|
params["maxTokens"] = max_tokens
|
|
42
42
|
response = await client._http.get(client.web_search_url, headers=headers, params=params)
|
|
@@ -1,18 +1,30 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: diffbot-python
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Python client library for Diffbot APIs
|
|
5
5
|
Project-URL: Homepage, https://github.com/diffbot/diffbot-python
|
|
6
|
+
Project-URL: Documentation, https://github.com/diffbot/diffbot-python#readme
|
|
6
7
|
Project-URL: Repository, https://github.com/diffbot/diffbot-python
|
|
7
8
|
Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
|
|
8
9
|
Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
|
|
9
10
|
License-Expression: MIT
|
|
10
11
|
License-File: LICENSE
|
|
12
|
+
Keywords: api-client,crawler,diffbot,extract,knowledge-graph,llm,nlp,web-scraping
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
11
15
|
Classifier: Operating System :: OS Independent
|
|
12
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
13
23
|
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
14
24
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
-
Classifier: Topic :: Software Development :: Libraries
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
27
|
+
Classifier: Typing :: Typed
|
|
16
28
|
Requires-Python: >=3.10
|
|
17
29
|
Requires-Dist: click>=8.1.0
|
|
18
30
|
Requires-Dist: httpx>=0.27.0
|
|
@@ -29,7 +41,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
|
|
|
29
41
|
## Installation
|
|
30
42
|
|
|
31
43
|
```bash
|
|
32
|
-
pip install
|
|
44
|
+
python3 -m pip install diffbot-python
|
|
33
45
|
```
|
|
34
46
|
|
|
35
47
|
Or, for local development:
|
|
@@ -41,12 +53,38 @@ pip install -e ".[dev]"
|
|
|
41
53
|
## Usage
|
|
42
54
|
|
|
43
55
|
### Authentication
|
|
44
|
-
|
|
56
|
+
|
|
57
|
+
The CLI and the library can share a single credential. The token always has to be
|
|
58
|
+
passed to the client explicitly, but `resolve_token()` gives you the same lookup the
|
|
59
|
+
CLI uses, in this order:
|
|
60
|
+
|
|
61
|
+
1. An explicit token passed to `resolve_token(token)`.
|
|
62
|
+
2. The `DIFFBOT_API_TOKEN` environment variable.
|
|
63
|
+
3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
|
|
64
|
+
|
|
65
|
+
Set it once and it works for both the CLI and your scripts. Either export it:
|
|
45
66
|
|
|
46
67
|
```bash
|
|
47
68
|
export DIFFBOT_API_TOKEN=<TOKEN>
|
|
48
69
|
```
|
|
49
70
|
|
|
71
|
+
…or write it to the shared credentials file (handy for keeping it out of your shell environment):
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
mkdir -p ~/.diffbot
|
|
75
|
+
printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
|
|
76
|
+
chmod 600 ~/.diffbot/credentials
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
With either in place, resolve the token and pass it to the client:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from diffbot import Diffbot, resolve_token
|
|
83
|
+
|
|
84
|
+
db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
|
|
85
|
+
data = db.extract("https://www.example.com")
|
|
86
|
+
```
|
|
87
|
+
|
|
50
88
|
### Extract structured content
|
|
51
89
|
```python
|
|
52
90
|
from diffbot import Diffbot
|
|
@@ -189,7 +227,15 @@ asyncio.run(main())
|
|
|
189
227
|
|
|
190
228
|
## CLI
|
|
191
229
|
|
|
192
|
-
This library also includes a CLI.
|
|
230
|
+
This library also includes a CLI exposed as the `db` command.
|
|
231
|
+
|
|
232
|
+
To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
uv tool install .
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
|
|
193
239
|
|
|
194
240
|
```bash
|
|
195
241
|
export DIFFBOT_API_TOKEN=your-token-here
|
|
@@ -212,7 +258,9 @@ Run the mock test suite:
|
|
|
212
258
|
python -m pytest
|
|
213
259
|
```
|
|
214
260
|
|
|
215
|
-
Run live integration tests against the real API (requires a valid token)
|
|
261
|
+
Run live integration tests against the real API (requires a valid token).
|
|
262
|
+
The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
|
|
263
|
+
environment variable or `~/.diffbot/credentials`:
|
|
216
264
|
```bash
|
|
217
|
-
|
|
265
|
+
DIFFBOT_API_TOKEN=your_token python -m pytest -m live
|
|
218
266
|
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
diffbot/__init__.py,sha256=OUDi3bVvKowQAQB4m8I4t761ct4pC-3de461qPI4Zok,827
|
|
2
|
+
diffbot/_auth.py,sha256=dzNMWqIjtG3YtRBb_EKpPRiBxvZgogxvoE0ABLvzfOk,1251
|
|
3
|
+
diffbot/ask.py,sha256=iNv613j4CoIfdDTOE-pl9KUkjqI-2AxGDMR1prm3DGM,1853
|
|
4
|
+
diffbot/client.py,sha256=G6vYUEDEaIqjJXcIRxAjwS_cvaXvM7KdQFVMIedkgUY,11315
|
|
5
|
+
diffbot/crawl.py,sha256=iYMFmf7HKrbefJrGg14VnlfiBFLOE_Z1pfO4Rn_cDXc,8893
|
|
6
|
+
diffbot/errors.py,sha256=5-AceX5MyNVUhe9pvR_4rnQQmBhvLfwWmrRl7dRZUSg,1576
|
|
7
|
+
diffbot/extract.py,sha256=R9SVxaOi4FjHOQIX5ho_75OwaJ7VdX-mdAoV_UY-lrM,1452
|
|
8
|
+
diffbot/kg.py,sha256=my-5vR4Vbe3LHzBDYVHLHo99KTtyiRXCT5625glsrlQ,3930
|
|
9
|
+
diffbot/nlp.py,sha256=lZJW4MkjhVklIEM2OBfhc8LvresXTy7RsuImPteYsOA,1153
|
|
10
|
+
diffbot/ontology.py,sha256=GWD5m2rz7ECWY6RrO1TeN3kk3DYTgmEHz6gFKASMvAU,6283
|
|
11
|
+
diffbot/web_search.py,sha256=RSaEK0pdAdLgtilc500Mf2MNmJ1DmiYYmIlBQBWvOpE,1356
|
|
12
|
+
diffbot/cli/__init__.py,sha256=UVGD3uevKTHmqEdbDhhR2PzO6-3i0xu8d7D_94jLLRo,16488
|
|
13
|
+
diffbot/cli/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
|
|
14
|
+
diffbot/cli/_common.py,sha256=_s8WRukzSX3krjJW6HJmT7_3hEjNyv1i_y45tm81b2A,654
|
|
15
|
+
diffbot/cli/dql.py,sha256=iIMaGysMWTScnbAHAdWei2iVUJpMV5lA1pk1E9uQg4s,11439
|
|
16
|
+
diffbot/cli/entities.py,sha256=tsHKexF0b6NnsoUEZJUS2rJidTIE8lt6xrcSenPbwtY,5630
|
|
17
|
+
diffbot/cli/ontology.py,sha256=cLuRQ0KTsOJnFCXdG_zIX9J1ODK8ThPznWVDDEpO1pg,2055
|
|
18
|
+
diffbot_python-0.2.1.dist-info/METADATA,sha256=xNpRnsnbGtPxU-zUAPwozdiHprTBS82zjUD87BMO36U,7402
|
|
19
|
+
diffbot_python-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
20
|
+
diffbot_python-0.2.1.dist-info/entry_points.txt,sha256=FCxJhrbl7VNEsTK7zl7qYvXID7gQ1_wxhiw5_Vllb_M,40
|
|
21
|
+
diffbot_python-0.2.1.dist-info/licenses/LICENSE,sha256=UZlamI1XGeiG0Mit8dsHssNhOuMGKfmNOp5qpf1533w,1063
|
|
22
|
+
diffbot_python-0.2.1.dist-info/RECORD,,
|
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
diffbot/__init__.py,sha256=WyzW2kAw75vgEmAyvrFkfcu_m0gL8uhOixeYVCKaHf4,518
|
|
2
|
-
diffbot/ask.py,sha256=iNv613j4CoIfdDTOE-pl9KUkjqI-2AxGDMR1prm3DGM,1853
|
|
3
|
-
diffbot/client.py,sha256=C5MIhyx8YZGgD4GjKj5QrIg-eRtUF3qhmUzYW77Sh9Q,10426
|
|
4
|
-
diffbot/crawl.py,sha256=iYMFmf7HKrbefJrGg14VnlfiBFLOE_Z1pfO4Rn_cDXc,8893
|
|
5
|
-
diffbot/errors.py,sha256=5-AceX5MyNVUhe9pvR_4rnQQmBhvLfwWmrRl7dRZUSg,1576
|
|
6
|
-
diffbot/extract.py,sha256=R9SVxaOi4FjHOQIX5ho_75OwaJ7VdX-mdAoV_UY-lrM,1452
|
|
7
|
-
diffbot/kg.py,sha256=Y7XTrPNAfPdX9vvhFmgmU4G4KTF9fwpYkm2Hh3c6DLA,2708
|
|
8
|
-
diffbot/nlp.py,sha256=lZJW4MkjhVklIEM2OBfhc8LvresXTy7RsuImPteYsOA,1153
|
|
9
|
-
diffbot/web_search.py,sha256=1sKBojzsslZj2zzl2kJ4s43AUaRn1i1fGgPUjFdsW6Q,1370
|
|
10
|
-
diffbot/cli/__init__.py,sha256=UVGD3uevKTHmqEdbDhhR2PzO6-3i0xu8d7D_94jLLRo,16488
|
|
11
|
-
diffbot/cli/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
|
|
12
|
-
diffbot/cli/_common.py,sha256=0I-oHnKVM9zLCaVNDc-7qf17b5u827IK7ezXITjsdW4,1006
|
|
13
|
-
diffbot/cli/dql.py,sha256=lJzAEjTIoF1l1xTwMcVIlq36pKZKrvgZYz0RuX5jUGc,11419
|
|
14
|
-
diffbot/cli/entities.py,sha256=tsHKexF0b6NnsoUEZJUS2rJidTIE8lt6xrcSenPbwtY,5630
|
|
15
|
-
diffbot/cli/ontology.py,sha256=FLIIe6ZY34zLHLt_bB9Zci0zlrKEAIIgNSdz5KwXqzw,3773
|
|
16
|
-
diffbot_python-0.1.0.dist-info/METADATA,sha256=KrMqxyqa2g6GPee1zc8A6J1SPXDLukELFslwbuMj9CM,5281
|
|
17
|
-
diffbot_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
18
|
-
diffbot_python-0.1.0.dist-info/entry_points.txt,sha256=FCxJhrbl7VNEsTK7zl7qYvXID7gQ1_wxhiw5_Vllb_M,40
|
|
19
|
-
diffbot_python-0.1.0.dist-info/licenses/LICENSE,sha256=UZlamI1XGeiG0Mit8dsHssNhOuMGKfmNOp5qpf1533w,1063
|
|
20
|
-
diffbot_python-0.1.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|