diffbot-python 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diffbot/__init__.py CHANGED
@@ -4,6 +4,7 @@ diffbot - Python client library for the Diffbot APIs.
4
4
 
# Keep in sync with the distribution metadata: this wheel is published as
# diffbot-python 0.2.0, so the importable version string must match it.
__version__ = "0.2.0"
6
6
 
7
+ from ._auth import resolve_token
7
8
  from .client import Diffbot, DiffbotAsync
8
9
  from .crawl import CrawlEvent, CrawlEventType
9
10
  from .errors import (
@@ -14,12 +15,15 @@ from .errors import (
14
15
  RateLimitError,
15
16
  ValidationError,
16
17
  )
18
+ from .ontology import Ontology
17
19
 
18
20
  __all__ = [
19
21
  "Diffbot",
20
22
  "DiffbotAsync",
23
+ "resolve_token",
21
24
  "CrawlEvent",
22
25
  "CrawlEventType",
26
+ "Ontology",
23
27
  "DiffbotError",
24
28
  "AuthError",
25
29
  "ExtractionError",
diffbot/_auth.py ADDED
@@ -0,0 +1,41 @@
1
+ """Shared Diffbot credential resolution for both the library and the CLI.
2
+
3
+ The same lookup chain is used everywhere so a single credential works for the
4
+ ``db`` CLI and any Python script that constructs a client:
5
+
6
+ 1. An explicit token passed to the client / function.
7
+ 2. The ``DIFFBOT_API_TOKEN`` environment variable.
8
+ 3. A ``DIFFBOT_API_TOKEN=...`` line in ``~/.diffbot/credentials``.
9
+ """
10
+
11
+ import os
12
+ import pathlib
13
+ from typing import Optional
14
+
15
+ TOKEN_ENV_VAR = "DIFFBOT_API_TOKEN"
16
+ CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
17
+
18
+
def _read_credentials_file() -> str:
    """Return the token stored in ``CREDENTIALS_PATH``, or ``""`` if none.

    The file is scanned line by line for the first line that, once stripped,
    starts with ``<TOKEN_ENV_VAR>=``; everything after the ``=`` is returned
    with surrounding whitespace removed. A missing file (or missing parent
    directory) yields an empty string rather than an error.
    """
    try:
        # Decode explicitly instead of relying on the locale encoding.
        # "utf-8-sig" also drops a leading BOM, which would otherwise make the
        # first line fail the ``startswith`` check below.
        text = CREDENTIALS_PATH.read_text(encoding="utf-8-sig")
    except (FileNotFoundError, NotADirectoryError):
        # EAFP: no separate exists() probe, so the file vanishing between the
        # check and the read cannot raise.
        return ""

    prefix = f"{TOKEN_ENV_VAR}="
    for line in text.splitlines():
        line = line.strip()
        if line.startswith(prefix):
            return line[len(prefix):].strip()
    return ""
27
+
28
+
def resolve_token(token: Optional[str] = None) -> str:
    """Pick the Diffbot API token from the first source that provides one.

    Sources are consulted in order: the explicit ``token`` argument, the
    ``TOKEN_ENV_VAR`` environment variable, then the credentials file.
    Whitespace-only values count as absent. An empty string is returned when
    no source yields a token.
    """
    explicit = token.strip() if token else ""
    if explicit:
        return explicit

    from_env = os.environ.get(TOKEN_ENV_VAR, "").strip()
    # Fall through to the credentials file only when the env var is unset/blank.
    return from_env or _read_credentials_file()
diffbot/cli/_common.py CHANGED
@@ -1,35 +1,20 @@
1
- import os
2
- import pathlib
3
-
4
1
  import click
5
2
 
6
- from diffbot import Diffbot
7
-
8
- CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
9
-
10
-
11
- def resolve_token() -> str:
12
- """Return the Diffbot API token from the env var, falling back to ~/.diffbot/credentials."""
13
- token = os.environ.get("DIFFBOT_API_TOKEN", "").strip()
14
- if token:
15
- return token
16
-
17
- if CREDENTIALS_PATH.exists():
18
- for line in CREDENTIALS_PATH.read_text().splitlines():
19
- line = line.strip()
20
- if line.startswith("DIFFBOT_API_TOKEN="):
21
- return line[len("DIFFBOT_API_TOKEN="):].strip()
22
-
23
- return ""
3
+ from diffbot import Diffbot, resolve_token
4
+ from diffbot._auth import CREDENTIALS_PATH, TOKEN_ENV_VAR
24
5
 
25
6
 
26
7
  def get_client() -> Diffbot:
8
+ """Build a Diffbot client using the shared credential resolution chain.
9
+
10
+ Looks at the DIFFBOT_API_TOKEN env var, then ~/.diffbot/credentials.
11
+ """
27
12
  token = resolve_token()
28
13
  if not token:
29
14
  click.echo(
30
15
  "Error: no Diffbot API token found.\n"
31
- " Set a DIFFBOT_API_TOKEN environment variable, or\n"
32
- f" write 'DIFFBOT_API_TOKEN=YOUR_TOKEN' to {CREDENTIALS_PATH}",
16
+ f" Set a {TOKEN_ENV_VAR} environment variable, or\n"
17
+ f" write '{TOKEN_ENV_VAR}=YOUR_TOKEN' to {CREDENTIALS_PATH}",
33
18
  err=True,
34
19
  )
35
20
  raise click.Abort()
diffbot/cli/dql.py CHANGED
@@ -15,7 +15,9 @@ from rich.table import Table
15
15
  from diffbot import DiffbotError
16
16
 
17
17
  from . import ontology
18
- from ._common import get_client, resolve_token
18
+ from diffbot import resolve_token
19
+
20
+ from ._common import get_client
19
21
 
20
22
 
21
23
  class _DqlGroup(click.Group):
diffbot/cli/ontology.py CHANGED
@@ -1,14 +1,24 @@
1
+ """CLI-side ontology access: a disk cache over the storage-agnostic core.
2
+
3
+ The navigation logic lives in :mod:`diffbot.ontology` (the `Ontology` class).
4
+ This module adds the CLI's caching policy on top: the ontology is read once from
5
+ ``~/.diffbot/ontology.json`` (populated by `db dql init`) and held in
6
+ ``_CACHE``. The module-level functions preserve the historical CLI surface and
7
+ simply delegate to an `Ontology` built from the cached document.
8
+ """
9
+
1
10
  import json
2
11
  import pathlib
3
- import re
4
- from typing import Any, Dict, List, Optional
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ from diffbot.ontology import Ontology
5
15
 
6
16
  ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
7
17
 
8
18
  _CACHE: Dict[str, Any] = {}
9
19
 
10
20
 
11
- def load() -> Dict[str, Any]:
21
+ def _data() -> Dict[str, Any]:
12
22
  if "data" not in _CACHE:
13
23
  if not ONTOLOGY_PATH.exists():
14
24
  raise FileNotFoundError(
@@ -18,113 +28,47 @@ def load() -> Dict[str, Any]:
18
28
  return _CACHE["data"]
19
29
 
20
30
 
31
+ def _ontology() -> Ontology:
32
+ return Ontology(_data())
33
+
34
+
21
35
  def list_types() -> List[str]:
22
- return sorted(load().get("types", {}).keys())
36
+ return _ontology().types()
23
37
 
24
38
 
25
39
  def list_composites() -> List[str]:
26
- return sorted(load().get("composites", {}).keys())
40
+ return _ontology().composites()
27
41
 
28
42
 
29
43
  def list_enums() -> List[str]:
30
- return sorted(load().get("enums", {}).keys())
44
+ return _ontology().enums()
31
45
 
32
46
 
33
47
  def list_taxonomies() -> List[str]:
34
- return sorted(load().get("taxonomies", {}).keys())
35
-
36
-
37
- def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
38
- entry = container.get(type_name)
39
- if entry is None:
40
- raise KeyError(f"Unknown name: {type_name}")
41
- return entry.get("fields", {})
48
+ return _ontology().taxonomies()
42
49
 
43
50
 
44
51
  def fields_for(type_name: str) -> Dict[str, Any]:
45
- data = load()
46
- types = data.get("types", {})
47
- composites = data.get("composites", {})
48
- if type_name in types:
49
- return _fields_of(types, type_name)
50
- if type_name in composites:
51
- return _fields_of(composites, type_name)
52
- raise KeyError(f"{type_name} is not a known entity type or composite")
52
+ return _ontology().fields_for(type_name)
53
53
 
54
54
 
55
55
  def format_field(name: str, meta: Dict[str, Any]) -> str:
56
- t = meta.get("type", "?")
57
- if t == "LinkedEntity":
58
- le = meta.get("leType") or []
59
- if le:
60
- t = f"LinkedEntity ({le[0]})"
61
- flags = []
62
- if meta.get("isList"):
63
- flags.append("isList")
64
- if meta.get("isComposite"):
65
- flags.append("isComposite")
66
- if meta.get("isEnum"):
67
- flags.append("isEnum")
68
- if meta.get("isDeprecated"):
69
- flags.append("DEPRECATED")
70
- suffix = "".join(f" [{f}]" for f in flags)
71
- return f"{name}: [{t}]{suffix}"
72
-
73
-
74
- def filter_fields(fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False) -> List[tuple]:
75
- pattern = re.compile(search, re.IGNORECASE) if search else None
76
- out = []
77
- for name, meta in fields.items():
78
- if not include_deprecated and meta.get("isDeprecated"):
79
- continue
80
- if pattern and not pattern.search(name):
81
- continue
82
- out.append((name, meta))
83
- return out
56
+ return Ontology.format_field(name, meta)
84
57
 
85
58
 
86
- def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
87
- data = load()
88
- tax = data.get("taxonomies", {}).get(name)
89
- if tax is None:
90
- raise KeyError(f"Unknown taxonomy: {name}")
91
- pattern = re.compile(search, re.IGNORECASE) if search else None
92
- out: List[str] = []
59
+ def filter_fields(
60
+ fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False
61
+ ) -> List[Tuple[str, Dict[str, Any]]]:
62
+ return Ontology.filter_fields(fields, search, include_deprecated=include_deprecated)
93
63
 
94
- def walk(node: Dict[str, Any]) -> None:
95
- n = node.get("name")
96
- if n and (pattern is None or pattern.search(n)):
97
- out.append(n)
98
- for child in node.get("children", []) or []:
99
- walk(child)
100
64
 
101
- for cat in tax.get("categories", []) or []:
102
- walk(cat)
103
- return out
65
+ def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
66
+ return _ontology().taxonomy_values(name, search)
104
67
 
105
68
 
106
69
  def enum_values(name: str) -> List[str]:
107
- data = load()
108
- enum = data.get("enums", {}).get(name)
109
- if enum is None:
110
- raise KeyError(f"Unknown enum: {name}")
111
- return list(enum.get("values", []))
70
+ return _ontology().enum_values(name)
112
71
 
113
72
 
114
73
  def find_named(search: str) -> List[str]:
115
- pattern = re.compile(search, re.IGNORECASE)
116
- found = set()
117
-
118
- def walk(node: Any) -> None:
119
- if isinstance(node, dict):
120
- n = node.get("name")
121
- if isinstance(n, str) and pattern.search(n):
122
- found.add(n)
123
- for v in node.values():
124
- walk(v)
125
- elif isinstance(node, list):
126
- for v in node:
127
- walk(v)
128
-
129
- walk(load())
130
- return sorted(found)
74
+ return _ontology().find_named(search)
diffbot/client.py CHANGED
@@ -24,9 +24,13 @@ from .crawl import (
24
24
  from .kg import (
25
25
  dql as _dql,
26
26
  dql_async as _dql_async,
27
+ dql_fetch_ontology as _dql_fetch_ontology,
28
+ dql_fetch_ontology_async as _dql_fetch_ontology_async,
27
29
  dql_parallel as _dql_parallel,
30
+ dql_parallel_async as _dql_parallel_async,
28
31
  dql_refresh_ontology as _dql_refresh_ontology,
29
32
  )
33
+ from .ontology import Ontology
30
34
  from .web_search import (
31
35
  WEB_SEARCH_BASE,
32
36
  web_search as _web_search,
@@ -48,8 +52,8 @@ class Diffbot:
48
52
  """Client for the Diffbot APIs.
49
53
 
50
54
  Example:
51
- >>> from diffbot import Diffbot
52
- >>> db = Diffbot(token=os.getenv("DIFFBOT_API_TOKEN"))
55
+ >>> from diffbot import Diffbot, resolve_token
56
+ >>> db = Diffbot(token=resolve_token()) # env var or ~/.diffbot/credentials
53
57
  >>> db.extract("https://example.com")
54
58
  """
55
59
 
@@ -155,6 +159,10 @@ class Diffbot:
155
159
  """Download the Diffbot Knowledge Graph ontology and write it to dest."""
156
160
  _dql_refresh_ontology(self, dest)
157
161
 
162
+ def dql_fetch_ontology(self) -> Ontology:
163
+ """Download the ontology and return it as a queryable Ontology (no caching)."""
164
+ return _dql_fetch_ontology(self)
165
+
158
166
  def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
159
167
  """Search the web via the Diffbot LLM web search API."""
160
168
  return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)
@@ -272,6 +280,14 @@ class DiffbotAsync:
272
280
  """
273
281
  return await _dql_async(self, query, size=size, from_=from_, format=format, filter=filter, exportspec=exportspec, extra=extra, raw=raw)
274
282
 
283
+ async def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
284
+ """Run multiple DQL queries concurrently. Each item is a dict of dql() keyword args."""
285
+ return await _dql_parallel_async(self, queries, workers=workers)
286
+
287
+ async def dql_fetch_ontology(self) -> Ontology:
288
+ """Download the ontology and return it as a queryable Ontology (no caching)."""
289
+ return await _dql_fetch_ontology_async(self)
290
+
275
291
  async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
276
292
  """Search the web via the Diffbot LLM web search API."""
277
293
  return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)
diffbot/kg.py CHANGED
@@ -1,9 +1,12 @@
1
1
  """Diffbot Knowledge Graph APIs: DQL search and entity enhancement."""
2
2
 
3
+ import asyncio
3
4
  import pathlib
4
5
  from concurrent.futures import ThreadPoolExecutor
5
6
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
6
7
 
8
+ from .ontology import Ontology
9
+
7
10
  if TYPE_CHECKING:
8
11
  from .client import Diffbot, DiffbotAsync
9
12
 
@@ -83,8 +86,43 @@ def dql_parallel(
83
86
  return list(ex.map(lambda q: dql(client, **q), queries))
84
87
 
85
88
 
async def dql_parallel_async(
    client: "DiffbotAsync",
    queries: Sequence[Dict[str, Any]],
    *,
    workers: int = 8,
) -> List[Union[Dict[str, Any], bytes]]:
    """Run several DQL queries concurrently on one async client.

    Args:
        client: The async client each query is issued through.
        queries: One dict of ``dql_async`` keyword arguments per query.
        workers: Maximum number of queries in flight at once.

    Returns:
        One result per query, in the same order as ``queries``. An empty
        ``queries`` returns ``[]`` without touching the client.

    Raises:
        ValueError: If ``workers`` is less than 1 and there is work to do.
    """
    if not queries:
        return []
    if workers < 1:
        # Semaphore(0) would never admit a task, so every query would wait
        # forever; fail loudly instead of hanging.
        raise ValueError("workers must be greater than 0")
    sem = asyncio.Semaphore(workers)

    async def _one(q: Dict[str, Any]) -> Union[Dict[str, Any], bytes]:
        # The semaphore caps concurrency; gather itself schedules everything.
        async with sem:
            return await dql_async(client, **q)

    return await asyncio.gather(*(_one(q) for q in queries))
104
+
105
+
def dql_refresh_ontology(client: "Diffbot", dest: pathlib.Path) -> None:
    """Download the ontology document and store its raw bytes at ``dest``.

    Missing parent directories of ``dest`` are created. The response body is
    written unmodified; ``client._raise_for_status`` is given the response
    before anything touches the disk.
    """
    resp = client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    target_dir = dest.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(resp.content)
111
+
112
+
def dql_fetch_ontology(client: "Diffbot") -> Ontology:
    """Fetch the ontology over HTTP and wrap it in an :class:`Ontology`.

    Nothing is cached or written to disk here; holding onto the returned
    object is up to the caller. To persist the raw document instead, use
    :func:`dql_refresh_ontology`.
    """
    resp = client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
122
+
123
+
async def dql_fetch_ontology_async(client: "DiffbotAsync") -> Ontology:
    """Fetch the ontology with the async client; see :func:`dql_fetch_ontology`."""
    resp = await client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
diffbot/ontology.py ADDED
@@ -0,0 +1,160 @@
1
+ """In-memory navigation of the Diffbot Knowledge Graph ontology.
2
+
3
+ The ontology is a JSON document describing the Knowledge Graph's entity types,
4
+ composite types, enums, and taxonomies. An agent constructing DQL needs it to
5
+ look up real field paths and taxonomy values instead of guessing them.
6
+
7
+ This module is pure and storage-agnostic: build an :class:`Ontology` from
8
+ already-parsed data (or from raw JSON / a file path) and query it. How the
9
+ ontology document is fetched, and whether or where it is cached, is left
10
+ entirely to the caller — the `db` CLI caches it on disk at
11
+ ``~/.diffbot/ontology.json``; an in-process consumer (e.g. langchain) can cache
12
+ the :class:`Ontology` in memory. Fetch a fresh one over HTTP with
13
+ :meth:`diffbot.Diffbot.dql_fetch_ontology`.
14
+ """
15
+
16
+ import json
17
+ import pathlib
18
+ import re
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+
class Ontology:
    """Queryable view over a parsed Diffbot ontology document.

    The instance holds the parsed document on :attr:`data` and exposes lookup
    methods over it. The lookup methods perform no I/O; construct with already
    parsed data, or use :meth:`from_json` / :meth:`from_path` for convenience.
    """

    def __init__(self, data: Dict[str, Any]):
        # Stored by reference (not copied); every lookup reads it directly.
        self.data = data

    @classmethod
    def from_json(cls, raw: Union[str, bytes]) -> "Ontology":
        """Build from a raw JSON string or bytes (e.g. an HTTP response body)."""
        return cls(json.loads(raw))

    @classmethod
    def from_path(cls, path: Union[str, pathlib.Path]) -> "Ontology":
        """Build from a JSON file on disk.

        The file is read as bytes so ``json.loads`` picks the encoding itself
        (UTF-8, UTF-16 or UTF-32) instead of depending on the platform's
        locale encoding.
        """
        return cls(json.loads(pathlib.Path(path).read_bytes()))

    def types(self) -> List[str]:
        """All entity type names (e.g. ``Organization``, ``Person``), sorted."""
        return sorted(self.data.get("types", {}).keys())

    def composites(self) -> List[str]:
        """All composite type names (e.g. ``Location``, ``Employment``), sorted."""
        return sorted(self.data.get("composites", {}).keys())

    def enums(self) -> List[str]:
        """All enum type names (e.g. ``Language``, ``Gender``), sorted."""
        return sorted(self.data.get("enums", {}).keys())

    def taxonomies(self) -> List[str]:
        """All taxonomy names (e.g. ``OrganizationCategory``), sorted."""
        return sorted(self.data.get("taxonomies", {}).keys())

    @staticmethod
    def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
        """Return the ``fields`` map of ``container[type_name]`` (``{}`` if absent).

        Raises ``KeyError`` when ``type_name`` is missing or maps to ``None``.
        """
        entry = container.get(type_name)
        if entry is None:
            raise KeyError(f"Unknown name: {type_name}")
        return entry.get("fields", {})

    def fields_for(self, type_name: str) -> Dict[str, Any]:
        """Return the field map of an entity type or composite.

        Auto-routes: ``type_name`` may be an entity type (``Organization``) or a
        composite (``Location``); entity types are checked first. Raises
        ``KeyError`` if it is neither.
        """
        types = self.data.get("types", {})
        composites = self.data.get("composites", {})
        if type_name in types:
            return self._fields_of(types, type_name)
        if type_name in composites:
            return self._fields_of(composites, type_name)
        raise KeyError(f"{type_name} is not a known entity type or composite")

    @staticmethod
    def filter_fields(
        fields: Dict[str, Any],
        search: Optional[str],
        include_deprecated: bool = False,
    ) -> List[Tuple[str, Dict[str, Any]]]:
        """Filter a field map by a name regex, dropping deprecated by default.

        ``search`` is a case-insensitive regular expression matched anywhere in
        the field name; a falsy ``search`` keeps every name. Results keep the
        iteration order of ``fields``.
        """
        pattern = re.compile(search, re.IGNORECASE) if search else None
        out = []
        for name, meta in fields.items():
            if not include_deprecated and meta.get("isDeprecated"):
                continue
            if pattern and not pattern.search(name):
                continue
            out.append((name, meta))
        return out

    def taxonomy_values(self, name: str, search: Optional[str] = None) -> List[str]:
        """Flatten a taxonomy's values (recursing into children), optionally filtered.

        Values come back in depth-first document order, parent before children.
        ``search`` is a case-insensitive regex. Raises ``KeyError`` for an
        unknown taxonomy.
        """
        tax = self.data.get("taxonomies", {}).get(name)
        if tax is None:
            raise KeyError(f"Unknown taxonomy: {name}")
        pattern = re.compile(search, re.IGNORECASE) if search else None
        out: List[str] = []

        def walk(node: Dict[str, Any]) -> None:
            n = node.get("name")
            if n and (pattern is None or pattern.search(n)):
                out.append(n)
            # ``or []`` tolerates an explicit null in place of the list.
            for child in node.get("children", []) or []:
                walk(child)

        for cat in tax.get("categories", []) or []:
            walk(cat)
        return out

    def enum_values(self, name: str) -> List[str]:
        """Return a copy of the allowed values of an enum.

        Raises ``KeyError`` for an unknown enum.
        """
        enum = self.data.get("enums", {}).get(name)
        if enum is None:
            raise KeyError(f"Unknown enum: {name}")
        return list(enum.get("values", []))

    def find_named(self, search: str) -> List[str]:
        """Fallback search: every ``name`` anywhere in the document matching a regex.

        Walks all nested dicts and lists; only string ``name`` values are
        considered. Matching is case-insensitive and the result is sorted and
        de-duplicated.
        """
        pattern = re.compile(search, re.IGNORECASE)
        found = set()

        def walk(node: Any) -> None:
            if isinstance(node, dict):
                n = node.get("name")
                if isinstance(n, str) and pattern.search(n):
                    found.add(n)
                for v in node.values():
                    walk(v)
            elif isinstance(node, list):
                for v in node:
                    walk(v)

        walk(self.data)
        return sorted(found)

    @staticmethod
    def format_field(name: str, meta: Dict[str, Any]) -> str:
        """Render one field as ``<name>: [<type>] [flags...]`` for display."""
        t = meta.get("type", "?")
        if t == "LinkedEntity":
            # Show the first linked entity type, when one is listed.
            le = meta.get("leType") or []
            if le:
                t = f"LinkedEntity ({le[0]})"
        flags = []
        if meta.get("isList"):
            flags.append("isList")
        if meta.get("isComposite"):
            flags.append("isComposite")
        if meta.get("isEnum"):
            flags.append("isEnum")
        if meta.get("isDeprecated"):
            flags.append("DEPRECATED")
        suffix = "".join(f" [{f}]" for f in flags)
        return f"{name}: [{t}]{suffix}"
diffbot/web_search.py CHANGED
@@ -18,7 +18,7 @@ def web_search(
18
18
  headers = {"Authorization": f"Bearer {client.token}"}
19
19
  params: Dict[str, Any] = {"text": text}
20
20
  if num_results is not None:
21
- params["num_results"] = num_results
21
+ params["size"] = num_results
22
22
  if max_tokens is not None:
23
23
  params["maxTokens"] = max_tokens
24
24
  response = client._http.get(client.web_search_url, headers=headers, params=params)
@@ -36,7 +36,7 @@ async def web_search_async(
36
36
  headers = {"Authorization": f"Bearer {client.token}"}
37
37
  params: Dict[str, Any] = {"text": text}
38
38
  if num_results is not None:
39
- params["num_results"] = num_results
39
+ params["size"] = num_results
40
40
  if max_tokens is not None:
41
41
  params["maxTokens"] = max_tokens
42
42
  response = await client._http.get(client.web_search_url, headers=headers, params=params)
@@ -1,18 +1,30 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: diffbot-python
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Python client library for Diffbot APIs
5
5
  Project-URL: Homepage, https://github.com/diffbot/diffbot-python
6
+ Project-URL: Documentation, https://github.com/diffbot/diffbot-python#readme
6
7
  Project-URL: Repository, https://github.com/diffbot/diffbot-python
7
8
  Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
8
9
  Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
9
10
  License-Expression: MIT
10
11
  License-File: LICENSE
12
+ Keywords: api-client,crawler,diffbot,extract,knowledge-graph,llm,nlp,web-scraping
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
11
15
  Classifier: Operating System :: OS Independent
12
16
  Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Internet :: WWW/HTTP
13
23
  Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
14
24
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
- Classifier: Topic :: Software Development :: Libraries
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Topic :: Text Processing :: Markup :: HTML
27
+ Classifier: Typing :: Typed
16
28
  Requires-Python: >=3.10
17
29
  Requires-Dist: click>=8.1.0
18
30
  Requires-Dist: httpx>=0.27.0
@@ -29,7 +41,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
29
41
  ## Installation
30
42
 
31
43
  ```bash
32
- pip install git+https://github.com/diffbot/diffbot-python.git
44
+ python3 -m pip install diffbot-python
33
45
  ```
34
46
 
35
47
  Or, for local development:
@@ -41,12 +53,38 @@ pip install -e ".[dev]"
41
53
  ## Usage
42
54
 
43
55
  ### Authentication
44
- Set your Diffbot API token in your environment or .env.
56
+
57
+ The CLI and the library can share a single credential. The token always has to be
58
+ passed to the client explicitly, but `resolve_token()` gives you the same lookup the
59
+ CLI uses, in this order:
60
+
61
+ 1. An explicit token passed to `resolve_token(token)`.
62
+ 2. The `DIFFBOT_API_TOKEN` environment variable.
63
+ 3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
64
+
65
+ Set it once and it works for both the CLI and your scripts. Either export it:
45
66
 
46
67
  ```bash
47
68
  export DIFFBOT_API_TOKEN=<TOKEN>
48
69
  ```
49
70
 
71
+ …or write it to the shared credentials file (handy for keeping it out of your shell environment):
72
+
73
+ ```bash
74
+ mkdir -p ~/.diffbot
75
+ printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
76
+ chmod 600 ~/.diffbot/credentials
77
+ ```
78
+
79
+ With either in place, resolve the token and pass it to the client:
80
+
81
+ ```python
82
+ from diffbot import Diffbot, resolve_token
83
+
84
+ db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
85
+ data = db.extract("https://www.example.com")
86
+ ```
87
+
50
88
  ### Extract structured content
51
89
  ```python
52
90
  from diffbot import Diffbot
@@ -189,7 +227,15 @@ asyncio.run(main())
189
227
 
190
228
  ## CLI
191
229
 
192
- This library also includes a CLI.
230
+ This library also includes a CLI exposed as the `db` command.
231
+
232
+ To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
233
+
234
+ ```bash
235
+ uv tool install .
236
+ ```
237
+
238
+ This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
193
239
 
194
240
  ```bash
195
241
  export DIFFBOT_API_TOKEN=your-token-here
@@ -212,7 +258,9 @@ Run the mock test suite:
212
258
  python -m pytest
213
259
  ```
214
260
 
215
- Run live integration tests against the real API (requires a valid token):
261
+ Run live integration tests against the real API (requires a valid token).
262
+ The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
263
+ environment variable or `~/.diffbot/credentials`:
216
264
  ```bash
217
- DIFFBOT_TOKEN=your_token python -m pytest -m live
265
+ DIFFBOT_API_TOKEN=your_token python -m pytest -m live
218
266
  ```
@@ -0,0 +1,22 @@
1
+ diffbot/__init__.py,sha256=hyIdyzIjXYvM2XXE20bDggIcJkTC7aR5s--LdLGFXOI,619
2
+ diffbot/_auth.py,sha256=dzNMWqIjtG3YtRBb_EKpPRiBxvZgogxvoE0ABLvzfOk,1251
3
+ diffbot/ask.py,sha256=iNv613j4CoIfdDTOE-pl9KUkjqI-2AxGDMR1prm3DGM,1853
4
+ diffbot/client.py,sha256=G6vYUEDEaIqjJXcIRxAjwS_cvaXvM7KdQFVMIedkgUY,11315
5
+ diffbot/crawl.py,sha256=iYMFmf7HKrbefJrGg14VnlfiBFLOE_Z1pfO4Rn_cDXc,8893
6
+ diffbot/errors.py,sha256=5-AceX5MyNVUhe9pvR_4rnQQmBhvLfwWmrRl7dRZUSg,1576
7
+ diffbot/extract.py,sha256=R9SVxaOi4FjHOQIX5ho_75OwaJ7VdX-mdAoV_UY-lrM,1452
8
+ diffbot/kg.py,sha256=my-5vR4Vbe3LHzBDYVHLHo99KTtyiRXCT5625glsrlQ,3930
9
+ diffbot/nlp.py,sha256=lZJW4MkjhVklIEM2OBfhc8LvresXTy7RsuImPteYsOA,1153
10
+ diffbot/ontology.py,sha256=GWD5m2rz7ECWY6RrO1TeN3kk3DYTgmEHz6gFKASMvAU,6283
11
+ diffbot/web_search.py,sha256=RSaEK0pdAdLgtilc500Mf2MNmJ1DmiYYmIlBQBWvOpE,1356
12
+ diffbot/cli/__init__.py,sha256=UVGD3uevKTHmqEdbDhhR2PzO6-3i0xu8d7D_94jLLRo,16488
13
+ diffbot/cli/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
14
+ diffbot/cli/_common.py,sha256=_s8WRukzSX3krjJW6HJmT7_3hEjNyv1i_y45tm81b2A,654
15
+ diffbot/cli/dql.py,sha256=iIMaGysMWTScnbAHAdWei2iVUJpMV5lA1pk1E9uQg4s,11439
16
+ diffbot/cli/entities.py,sha256=tsHKexF0b6NnsoUEZJUS2rJidTIE8lt6xrcSenPbwtY,5630
17
+ diffbot/cli/ontology.py,sha256=cLuRQ0KTsOJnFCXdG_zIX9J1ODK8ThPznWVDDEpO1pg,2055
18
+ diffbot_python-0.2.0.dist-info/METADATA,sha256=LooJOiHWS4HP8JlLvackJ2JEauezwZ2qe39xCgyhk2c,7402
19
+ diffbot_python-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
20
+ diffbot_python-0.2.0.dist-info/entry_points.txt,sha256=FCxJhrbl7VNEsTK7zl7qYvXID7gQ1_wxhiw5_Vllb_M,40
21
+ diffbot_python-0.2.0.dist-info/licenses/LICENSE,sha256=UZlamI1XGeiG0Mit8dsHssNhOuMGKfmNOp5qpf1533w,1063
22
+ diffbot_python-0.2.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.30.1
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,20 +0,0 @@
1
- diffbot/__init__.py,sha256=WyzW2kAw75vgEmAyvrFkfcu_m0gL8uhOixeYVCKaHf4,518
2
- diffbot/ask.py,sha256=iNv613j4CoIfdDTOE-pl9KUkjqI-2AxGDMR1prm3DGM,1853
3
- diffbot/client.py,sha256=C5MIhyx8YZGgD4GjKj5QrIg-eRtUF3qhmUzYW77Sh9Q,10426
4
- diffbot/crawl.py,sha256=iYMFmf7HKrbefJrGg14VnlfiBFLOE_Z1pfO4Rn_cDXc,8893
5
- diffbot/errors.py,sha256=5-AceX5MyNVUhe9pvR_4rnQQmBhvLfwWmrRl7dRZUSg,1576
6
- diffbot/extract.py,sha256=R9SVxaOi4FjHOQIX5ho_75OwaJ7VdX-mdAoV_UY-lrM,1452
7
- diffbot/kg.py,sha256=Y7XTrPNAfPdX9vvhFmgmU4G4KTF9fwpYkm2Hh3c6DLA,2708
8
- diffbot/nlp.py,sha256=lZJW4MkjhVklIEM2OBfhc8LvresXTy7RsuImPteYsOA,1153
9
- diffbot/web_search.py,sha256=1sKBojzsslZj2zzl2kJ4s43AUaRn1i1fGgPUjFdsW6Q,1370
10
- diffbot/cli/__init__.py,sha256=UVGD3uevKTHmqEdbDhhR2PzO6-3i0xu8d7D_94jLLRo,16488
11
- diffbot/cli/__main__.py,sha256=5BjNuyet8AY-POwoF5rGt722rHQ7tJ0Vf0UFUfzzi-I,58
12
- diffbot/cli/_common.py,sha256=0I-oHnKVM9zLCaVNDc-7qf17b5u827IK7ezXITjsdW4,1006
13
- diffbot/cli/dql.py,sha256=lJzAEjTIoF1l1xTwMcVIlq36pKZKrvgZYz0RuX5jUGc,11419
14
- diffbot/cli/entities.py,sha256=tsHKexF0b6NnsoUEZJUS2rJidTIE8lt6xrcSenPbwtY,5630
15
- diffbot/cli/ontology.py,sha256=FLIIe6ZY34zLHLt_bB9Zci0zlrKEAIIgNSdz5KwXqzw,3773
16
- diffbot_python-0.1.0.dist-info/METADATA,sha256=KrMqxyqa2g6GPee1zc8A6J1SPXDLukELFslwbuMj9CM,5281
17
- diffbot_python-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
18
- diffbot_python-0.1.0.dist-info/entry_points.txt,sha256=FCxJhrbl7VNEsTK7zl7qYvXID7gQ1_wxhiw5_Vllb_M,40
19
- diffbot_python-0.1.0.dist-info/licenses/LICENSE,sha256=UZlamI1XGeiG0Mit8dsHssNhOuMGKfmNOp5qpf1533w,1063
20
- diffbot_python-0.1.0.dist-info/RECORD,,