diffbot-python 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/.gitignore +3 -0
- diffbot_python-0.1.0/README.md → diffbot_python-0.2.0/PKG-INFO +76 -5
- diffbot_python-0.1.0/PKG-INFO → diffbot_python-0.2.0/README.md +41 -28
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/pyproject.toml +40 -10
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/__init__.py +4 -0
- diffbot_python-0.2.0/src/diffbot/_auth.py +41 -0
- diffbot_python-0.2.0/src/diffbot/cli/_common.py +21 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/dql.py +3 -1
- diffbot_python-0.2.0/src/diffbot/cli/ontology.py +74 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/client.py +18 -2
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/kg.py +38 -0
- diffbot_python-0.2.0/src/diffbot/ontology.py +160 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/web_search.py +2 -2
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/conftest.py +3 -5
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_extract.py +39 -1
- diffbot_python-0.2.0/tests/test_ontology.py +119 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_readme_examples.py +19 -1
- diffbot_python-0.1.0/AGENTS.md +0 -5
- diffbot_python-0.1.0/CLAUDE.md +0 -1
- diffbot_python-0.1.0/src/diffbot/cli/_common.py +0 -36
- diffbot_python-0.1.0/src/diffbot/cli/ontology.py +0 -130
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/LICENSE +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/ask.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/__init__.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/__main__.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/entities.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/crawl.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/errors.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/extract.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/nlp.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_ask.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_async.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_crawl.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_dql.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_dql_cli.py +0 -0
- {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_web_search.py +0 -0
|
@@ -1,3 +1,38 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: diffbot-python
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Python client library for Diffbot APIs
|
|
5
|
+
Project-URL: Homepage, https://github.com/diffbot/diffbot-python
|
|
6
|
+
Project-URL: Documentation, https://github.com/diffbot/diffbot-python#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/diffbot/diffbot-python
|
|
8
|
+
Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
|
|
9
|
+
Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: api-client,crawler,diffbot,extract,knowledge-graph,llm,nlp,web-scraping
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
23
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
27
|
+
Classifier: Typing :: Typed
|
|
28
|
+
Requires-Python: >=3.10
|
|
29
|
+
Requires-Dist: click>=8.1.0
|
|
30
|
+
Requires-Dist: httpx>=0.27.0
|
|
31
|
+
Requires-Dist: rich>=13.0.0
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
1
36
|
# Diffbot Python Library
|
|
2
37
|
|
|
3
38
|
Python client library for [Diffbot](https://www.diffbot.com) APIs.
|
|
@@ -6,7 +41,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
|
|
|
6
41
|
## Installation
|
|
7
42
|
|
|
8
43
|
```bash
|
|
9
|
-
pip install
|
|
44
|
+
python3 -m pip install diffbot-python
|
|
10
45
|
```
|
|
11
46
|
|
|
12
47
|
Or, for local development:
|
|
@@ -18,12 +53,38 @@ pip install -e ".[dev]"
|
|
|
18
53
|
## Usage
|
|
19
54
|
|
|
20
55
|
### Authentication
|
|
21
|
-
|
|
56
|
+
|
|
57
|
+
The CLI and the library can share a single credential. The token always has to be
|
|
58
|
+
passed to the client explicitly, but `resolve_token()` gives you the same lookup the
|
|
59
|
+
CLI uses, in this order:
|
|
60
|
+
|
|
61
|
+
1. An explicit token passed to `resolve_token(token)`.
|
|
62
|
+
2. The `DIFFBOT_API_TOKEN` environment variable.
|
|
63
|
+
3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
|
|
64
|
+
|
|
65
|
+
Set it once and it works for both the CLI and your scripts. Either export it:
|
|
22
66
|
|
|
23
67
|
```bash
|
|
24
68
|
export DIFFBOT_API_TOKEN=<TOKEN>
|
|
25
69
|
```
|
|
26
70
|
|
|
71
|
+
…or write it to the shared credentials file (handy for keeping it out of your shell environment):
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
mkdir -p ~/.diffbot
|
|
75
|
+
printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
|
|
76
|
+
chmod 600 ~/.diffbot/credentials
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
With either in place, resolve the token and pass it to the client:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from diffbot import Diffbot, resolve_token
|
|
83
|
+
|
|
84
|
+
db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
|
|
85
|
+
data = db.extract("https://www.example.com")
|
|
86
|
+
```
|
|
87
|
+
|
|
27
88
|
### Extract structured content
|
|
28
89
|
```python
|
|
29
90
|
from diffbot import Diffbot
|
|
@@ -166,7 +227,15 @@ asyncio.run(main())
|
|
|
166
227
|
|
|
167
228
|
## CLI
|
|
168
229
|
|
|
169
|
-
This library also includes a CLI.
|
|
230
|
+
This library also includes a CLI exposed as the `db` command.
|
|
231
|
+
|
|
232
|
+
To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
uv tool install .
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
|
|
170
239
|
|
|
171
240
|
```bash
|
|
172
241
|
export DIFFBOT_API_TOKEN=your-token-here
|
|
@@ -189,7 +258,9 @@ Run the mock test suite:
|
|
|
189
258
|
python -m pytest
|
|
190
259
|
```
|
|
191
260
|
|
|
192
|
-
Run live integration tests against the real API (requires a valid token)
|
|
261
|
+
Run live integration tests against the real API (requires a valid token).
|
|
262
|
+
The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
|
|
263
|
+
environment variable or `~/.diffbot/credentials`:
|
|
193
264
|
```bash
|
|
194
|
-
|
|
265
|
+
DIFFBOT_API_TOKEN=your_token python -m pytest -m live
|
|
195
266
|
```
|
|
@@ -1,26 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: diffbot-python
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Python client library for Diffbot APIs
|
|
5
|
-
Project-URL: Homepage, https://github.com/diffbot/diffbot-python
|
|
6
|
-
Project-URL: Repository, https://github.com/diffbot/diffbot-python
|
|
7
|
-
Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
|
|
8
|
-
Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
|
|
9
|
-
License-Expression: MIT
|
|
10
|
-
License-File: LICENSE
|
|
11
|
-
Classifier: Operating System :: OS Independent
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
14
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
-
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
-
Requires-Python: >=3.10
|
|
17
|
-
Requires-Dist: click>=8.1.0
|
|
18
|
-
Requires-Dist: httpx>=0.27.0
|
|
19
|
-
Requires-Dist: rich>=13.0.0
|
|
20
|
-
Provides-Extra: dev
|
|
21
|
-
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
22
|
-
Description-Content-Type: text/markdown
|
|
23
|
-
|
|
24
1
|
# Diffbot Python Library
|
|
25
2
|
|
|
26
3
|
Python client library for [Diffbot](https://www.diffbot.com) APIs.
|
|
@@ -29,7 +6,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
|
|
|
29
6
|
## Installation
|
|
30
7
|
|
|
31
8
|
```bash
|
|
32
|
-
pip install
|
|
9
|
+
python3 -m pip install diffbot-python
|
|
33
10
|
```
|
|
34
11
|
|
|
35
12
|
Or, for local development:
|
|
@@ -41,12 +18,38 @@ pip install -e ".[dev]"
|
|
|
41
18
|
## Usage
|
|
42
19
|
|
|
43
20
|
### Authentication
|
|
44
|
-
|
|
21
|
+
|
|
22
|
+
The CLI and the library can share a single credential. The token always has to be
|
|
23
|
+
passed to the client explicitly, but `resolve_token()` gives you the same lookup the
|
|
24
|
+
CLI uses, in this order:
|
|
25
|
+
|
|
26
|
+
1. An explicit token passed to `resolve_token(token)`.
|
|
27
|
+
2. The `DIFFBOT_API_TOKEN` environment variable.
|
|
28
|
+
3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
|
|
29
|
+
|
|
30
|
+
Set it once and it works for both the CLI and your scripts. Either export it:
|
|
45
31
|
|
|
46
32
|
```bash
|
|
47
33
|
export DIFFBOT_API_TOKEN=<TOKEN>
|
|
48
34
|
```
|
|
49
35
|
|
|
36
|
+
…or write it to the shared credentials file (handy for keeping it out of your shell environment):
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
mkdir -p ~/.diffbot
|
|
40
|
+
printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
|
|
41
|
+
chmod 600 ~/.diffbot/credentials
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
With either in place, resolve the token and pass it to the client:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from diffbot import Diffbot, resolve_token
|
|
48
|
+
|
|
49
|
+
db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
|
|
50
|
+
data = db.extract("https://www.example.com")
|
|
51
|
+
```
|
|
52
|
+
|
|
50
53
|
### Extract structured content
|
|
51
54
|
```python
|
|
52
55
|
from diffbot import Diffbot
|
|
@@ -189,7 +192,15 @@ asyncio.run(main())
|
|
|
189
192
|
|
|
190
193
|
## CLI
|
|
191
194
|
|
|
192
|
-
This library also includes a CLI.
|
|
195
|
+
This library also includes a CLI exposed as the `db` command.
|
|
196
|
+
|
|
197
|
+
To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
uv tool install .
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
|
|
193
204
|
|
|
194
205
|
```bash
|
|
195
206
|
export DIFFBOT_API_TOKEN=your-token-here
|
|
@@ -212,7 +223,9 @@ Run the mock test suite:
|
|
|
212
223
|
python -m pytest
|
|
213
224
|
```
|
|
214
225
|
|
|
215
|
-
Run live integration tests against the real API (requires a valid token)
|
|
226
|
+
Run live integration tests against the real API (requires a valid token).
|
|
227
|
+
The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
|
|
228
|
+
environment variable or `~/.diffbot/credentials`:
|
|
216
229
|
```bash
|
|
217
|
-
|
|
230
|
+
DIFFBOT_API_TOKEN=your_token python -m pytest -m live
|
|
218
231
|
```
|
|
@@ -4,23 +4,43 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "diffbot-python"
|
|
7
|
-
version = "0.
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Python client library for Diffbot APIs"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
11
|
-
classifiers = [
|
|
12
|
-
"Programming Language :: Python :: 3",
|
|
13
|
-
"Operating System :: OS Independent",
|
|
14
|
-
"Topic :: Software Development :: Libraries",
|
|
15
|
-
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
16
|
-
"Topic :: Internet :: WWW/HTTP :: Indexing/Search"
|
|
17
|
-
]
|
|
18
11
|
license = "MIT"
|
|
19
12
|
license-files = ["LICEN[CS]E*"]
|
|
20
13
|
authors = [
|
|
21
14
|
{ name = "Jerome Choo", email = "jerome@diffbot.com" },
|
|
22
15
|
{ name = "Mike Tung", email = "miket@diffbot.com" }
|
|
23
16
|
]
|
|
17
|
+
keywords = [
|
|
18
|
+
"diffbot",
|
|
19
|
+
"knowledge-graph",
|
|
20
|
+
"web-scraping",
|
|
21
|
+
"extract",
|
|
22
|
+
"crawler",
|
|
23
|
+
"nlp",
|
|
24
|
+
"llm",
|
|
25
|
+
"api-client",
|
|
26
|
+
]
|
|
27
|
+
classifiers = [
|
|
28
|
+
"Development Status :: 3 - Alpha",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"Operating System :: OS Independent",
|
|
31
|
+
"Programming Language :: Python :: 3",
|
|
32
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
33
|
+
"Programming Language :: Python :: 3.10",
|
|
34
|
+
"Programming Language :: Python :: 3.11",
|
|
35
|
+
"Programming Language :: Python :: 3.12",
|
|
36
|
+
"Programming Language :: Python :: 3.13",
|
|
37
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
38
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
39
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
40
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
41
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
42
|
+
"Typing :: Typed",
|
|
43
|
+
]
|
|
24
44
|
dependencies = [
|
|
25
45
|
"httpx>=0.27.0",
|
|
26
46
|
"click>=8.1.0",
|
|
@@ -34,6 +54,7 @@ dev = [
|
|
|
34
54
|
|
|
35
55
|
[project.urls]
|
|
36
56
|
Homepage = "https://github.com/diffbot/diffbot-python"
|
|
57
|
+
Documentation = "https://github.com/diffbot/diffbot-python#readme"
|
|
37
58
|
Repository = "https://github.com/diffbot/diffbot-python"
|
|
38
59
|
Issues = "https://github.com/diffbot/diffbot-python/issues"
|
|
39
60
|
|
|
@@ -43,6 +64,15 @@ db = "diffbot.cli:main"
|
|
|
43
64
|
[tool.hatch.build.targets.wheel]
|
|
44
65
|
packages = ["src/diffbot"]
|
|
45
66
|
|
|
67
|
+
[tool.hatch.build.targets.sdist]
|
|
68
|
+
include = [
|
|
69
|
+
"/src",
|
|
70
|
+
"/tests",
|
|
71
|
+
"/README.md",
|
|
72
|
+
"/LICENSE",
|
|
73
|
+
"/pyproject.toml",
|
|
74
|
+
]
|
|
75
|
+
|
|
46
76
|
[tool.pytest.ini_options]
|
|
47
|
-
markers = ["live: marks tests as live integration tests requiring a real
|
|
48
|
-
addopts = "-m 'not live'"
|
|
77
|
+
markers = ["live: marks tests as live integration tests requiring a real DIFFBOT_API_TOKEN"]
|
|
78
|
+
addopts = "-m 'not live'"
|
|
@@ -4,6 +4,7 @@ diffbot - Python client library for the Diffbot APIs.
|
|
|
4
4
|
|
|
5
5
|
__version__ = "0.1.0"
|
|
6
6
|
|
|
7
|
+
from ._auth import resolve_token
|
|
7
8
|
from .client import Diffbot, DiffbotAsync
|
|
8
9
|
from .crawl import CrawlEvent, CrawlEventType
|
|
9
10
|
from .errors import (
|
|
@@ -14,12 +15,15 @@ from .errors import (
|
|
|
14
15
|
RateLimitError,
|
|
15
16
|
ValidationError,
|
|
16
17
|
)
|
|
18
|
+
from .ontology import Ontology
|
|
17
19
|
|
|
18
20
|
__all__ = [
|
|
19
21
|
"Diffbot",
|
|
20
22
|
"DiffbotAsync",
|
|
23
|
+
"resolve_token",
|
|
21
24
|
"CrawlEvent",
|
|
22
25
|
"CrawlEventType",
|
|
26
|
+
"Ontology",
|
|
23
27
|
"DiffbotError",
|
|
24
28
|
"AuthError",
|
|
25
29
|
"ExtractionError",
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Shared Diffbot credential resolution for both the library and the CLI.
|
|
2
|
+
|
|
3
|
+
The same lookup chain is used everywhere so a single credential works for the
|
|
4
|
+
``db`` CLI and any Python script that constructs a client:
|
|
5
|
+
|
|
6
|
+
1. An explicit token passed to the client / function.
|
|
7
|
+
2. The ``DIFFBOT_API_TOKEN`` environment variable.
|
|
8
|
+
3. A ``DIFFBOT_API_TOKEN=...`` line in ``~/.diffbot/credentials``.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
import pathlib
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
TOKEN_ENV_VAR = "DIFFBOT_API_TOKEN"
|
|
16
|
+
CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _read_credentials_file() -> str:
|
|
20
|
+
if not CREDENTIALS_PATH.exists():
|
|
21
|
+
return ""
|
|
22
|
+
for line in CREDENTIALS_PATH.read_text().splitlines():
|
|
23
|
+
line = line.strip()
|
|
24
|
+
if line.startswith(f"{TOKEN_ENV_VAR}="):
|
|
25
|
+
return line[len(TOKEN_ENV_VAR) + 1:].strip()
|
|
26
|
+
return ""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def resolve_token(token: Optional[str] = None) -> str:
|
|
30
|
+
"""Resolve a Diffbot API token from the explicit argument, env var, or file.
|
|
31
|
+
|
|
32
|
+
Returns an empty string if no token can be found.
|
|
33
|
+
"""
|
|
34
|
+
if token and token.strip():
|
|
35
|
+
return token.strip()
|
|
36
|
+
|
|
37
|
+
env_token = os.environ.get(TOKEN_ENV_VAR, "").strip()
|
|
38
|
+
if env_token:
|
|
39
|
+
return env_token
|
|
40
|
+
|
|
41
|
+
return _read_credentials_file()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
from diffbot import Diffbot, resolve_token
|
|
4
|
+
from diffbot._auth import CREDENTIALS_PATH, TOKEN_ENV_VAR
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_client() -> Diffbot:
|
|
8
|
+
"""Build a Diffbot client using the shared credential resolution chain.
|
|
9
|
+
|
|
10
|
+
Looks at the DIFFBOT_API_TOKEN env var, then ~/.diffbot/credentials.
|
|
11
|
+
"""
|
|
12
|
+
token = resolve_token()
|
|
13
|
+
if not token:
|
|
14
|
+
click.echo(
|
|
15
|
+
"Error: no Diffbot API token found.\n"
|
|
16
|
+
f" Set a {TOKEN_ENV_VAR} environment variable, or\n"
|
|
17
|
+
f" write '{TOKEN_ENV_VAR}=YOUR_TOKEN' to {CREDENTIALS_PATH}",
|
|
18
|
+
err=True,
|
|
19
|
+
)
|
|
20
|
+
raise click.Abort()
|
|
21
|
+
return Diffbot(token=token)
|
|
@@ -15,7 +15,9 @@ from rich.table import Table
|
|
|
15
15
|
from diffbot import DiffbotError
|
|
16
16
|
|
|
17
17
|
from . import ontology
|
|
18
|
-
from
|
|
18
|
+
from diffbot import resolve_token
|
|
19
|
+
|
|
20
|
+
from ._common import get_client
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
class _DqlGroup(click.Group):
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""CLI-side ontology access: a disk cache over the storage-agnostic core.
|
|
2
|
+
|
|
3
|
+
The navigation logic lives in :mod:`diffbot.ontology` (the `Ontology` class).
|
|
4
|
+
This module adds the CLI's caching policy on top: the ontology is read once from
|
|
5
|
+
``~/.diffbot/ontology.json`` (populated by `db dql init`) and held in
|
|
6
|
+
``_CACHE``. The module-level functions preserve the historical CLI surface and
|
|
7
|
+
simply delegate to an `Ontology` built from the cached document.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import pathlib
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
from diffbot.ontology import Ontology
|
|
15
|
+
|
|
16
|
+
ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
|
|
17
|
+
|
|
18
|
+
_CACHE: Dict[str, Any] = {}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _data() -> Dict[str, Any]:
|
|
22
|
+
if "data" not in _CACHE:
|
|
23
|
+
if not ONTOLOGY_PATH.exists():
|
|
24
|
+
raise FileNotFoundError(
|
|
25
|
+
f"Ontology not found at {ONTOLOGY_PATH}. Run: db dql init"
|
|
26
|
+
)
|
|
27
|
+
_CACHE["data"] = json.loads(ONTOLOGY_PATH.read_text())
|
|
28
|
+
return _CACHE["data"]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _ontology() -> Ontology:
|
|
32
|
+
return Ontology(_data())
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def list_types() -> List[str]:
|
|
36
|
+
return _ontology().types()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def list_composites() -> List[str]:
|
|
40
|
+
return _ontology().composites()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def list_enums() -> List[str]:
|
|
44
|
+
return _ontology().enums()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def list_taxonomies() -> List[str]:
|
|
48
|
+
return _ontology().taxonomies()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def fields_for(type_name: str) -> Dict[str, Any]:
|
|
52
|
+
return _ontology().fields_for(type_name)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def format_field(name: str, meta: Dict[str, Any]) -> str:
|
|
56
|
+
return Ontology.format_field(name, meta)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def filter_fields(
|
|
60
|
+
fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False
|
|
61
|
+
) -> List[Tuple[str, Dict[str, Any]]]:
|
|
62
|
+
return Ontology.filter_fields(fields, search, include_deprecated=include_deprecated)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
|
|
66
|
+
return _ontology().taxonomy_values(name, search)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def enum_values(name: str) -> List[str]:
|
|
70
|
+
return _ontology().enum_values(name)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def find_named(search: str) -> List[str]:
|
|
74
|
+
return _ontology().find_named(search)
|
|
@@ -24,9 +24,13 @@ from .crawl import (
|
|
|
24
24
|
from .kg import (
|
|
25
25
|
dql as _dql,
|
|
26
26
|
dql_async as _dql_async,
|
|
27
|
+
dql_fetch_ontology as _dql_fetch_ontology,
|
|
28
|
+
dql_fetch_ontology_async as _dql_fetch_ontology_async,
|
|
27
29
|
dql_parallel as _dql_parallel,
|
|
30
|
+
dql_parallel_async as _dql_parallel_async,
|
|
28
31
|
dql_refresh_ontology as _dql_refresh_ontology,
|
|
29
32
|
)
|
|
33
|
+
from .ontology import Ontology
|
|
30
34
|
from .web_search import (
|
|
31
35
|
WEB_SEARCH_BASE,
|
|
32
36
|
web_search as _web_search,
|
|
@@ -48,8 +52,8 @@ class Diffbot:
|
|
|
48
52
|
"""Client for the Diffbot APIs.
|
|
49
53
|
|
|
50
54
|
Example:
|
|
51
|
-
>>> from diffbot import Diffbot
|
|
52
|
-
>>> db = Diffbot(token=
|
|
55
|
+
>>> from diffbot import Diffbot, resolve_token
|
|
56
|
+
>>> db = Diffbot(token=resolve_token()) # env var or ~/.diffbot/credentials
|
|
53
57
|
>>> db.extract("https://example.com")
|
|
54
58
|
"""
|
|
55
59
|
|
|
@@ -155,6 +159,10 @@ class Diffbot:
|
|
|
155
159
|
"""Download the Diffbot Knowledge Graph ontology and write it to dest."""
|
|
156
160
|
_dql_refresh_ontology(self, dest)
|
|
157
161
|
|
|
162
|
+
def dql_fetch_ontology(self) -> Ontology:
|
|
163
|
+
"""Download the ontology and return it as a queryable Ontology (no caching)."""
|
|
164
|
+
return _dql_fetch_ontology(self)
|
|
165
|
+
|
|
158
166
|
def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
|
|
159
167
|
"""Search the web via the Diffbot LLM web search API."""
|
|
160
168
|
return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)
|
|
@@ -272,6 +280,14 @@ class DiffbotAsync:
|
|
|
272
280
|
"""
|
|
273
281
|
return await _dql_async(self, query, size=size, from_=from_, format=format, filter=filter, exportspec=exportspec, extra=extra, raw=raw)
|
|
274
282
|
|
|
283
|
+
async def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
|
|
284
|
+
"""Run multiple DQL queries concurrently. Each item is a dict of dql() keyword args."""
|
|
285
|
+
return await _dql_parallel_async(self, queries, workers=workers)
|
|
286
|
+
|
|
287
|
+
async def dql_fetch_ontology(self) -> Ontology:
|
|
288
|
+
"""Download the ontology and return it as a queryable Ontology (no caching)."""
|
|
289
|
+
return await _dql_fetch_ontology_async(self)
|
|
290
|
+
|
|
275
291
|
async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
|
|
276
292
|
"""Search the web via the Diffbot LLM web search API."""
|
|
277
293
|
return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)
|
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
"""Diffbot Knowledge Graph APIs: DQL search and entity enhancement."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import pathlib
|
|
4
5
|
from concurrent.futures import ThreadPoolExecutor
|
|
5
6
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
|
|
6
7
|
|
|
8
|
+
from .ontology import Ontology
|
|
9
|
+
|
|
7
10
|
if TYPE_CHECKING:
|
|
8
11
|
from .client import Diffbot, DiffbotAsync
|
|
9
12
|
|
|
@@ -83,8 +86,43 @@ def dql_parallel(
|
|
|
83
86
|
return list(ex.map(lambda q: dql(client, **q), queries))
|
|
84
87
|
|
|
85
88
|
|
|
89
|
+
async def dql_parallel_async(
|
|
90
|
+
client: "DiffbotAsync",
|
|
91
|
+
queries: Sequence[Dict[str, Any]],
|
|
92
|
+
*,
|
|
93
|
+
workers: int = 8,
|
|
94
|
+
) -> List[Union[Dict[str, Any], bytes]]:
|
|
95
|
+
if not queries:
|
|
96
|
+
return []
|
|
97
|
+
sem = asyncio.Semaphore(workers)
|
|
98
|
+
|
|
99
|
+
async def _one(q: Dict[str, Any]) -> Union[Dict[str, Any], bytes]:
|
|
100
|
+
async with sem:
|
|
101
|
+
return await dql_async(client, **q)
|
|
102
|
+
|
|
103
|
+
return await asyncio.gather(*(_one(q) for q in queries))
|
|
104
|
+
|
|
105
|
+
|
|
86
106
|
def dql_refresh_ontology(client: "Diffbot", dest: pathlib.Path) -> None:
|
|
87
107
|
response = client._http.get(KG_ONTOLOGY_ENDPOINT)
|
|
88
108
|
client._raise_for_status(response)
|
|
89
109
|
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
90
110
|
dest.write_bytes(response.content)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def dql_fetch_ontology(client: "Diffbot") -> Ontology:
|
|
114
|
+
"""Download the ontology and return it as a queryable :class:`Ontology`.
|
|
115
|
+
|
|
116
|
+
Performs no caching — the caller decides whether and where to hold onto the
|
|
117
|
+
result. Use :func:`dql_refresh_ontology` instead to persist raw bytes to disk.
|
|
118
|
+
"""
|
|
119
|
+
response = client._http.get(KG_ONTOLOGY_ENDPOINT)
|
|
120
|
+
client._raise_for_status(response)
|
|
121
|
+
return Ontology.from_json(response.content)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def dql_fetch_ontology_async(client: "DiffbotAsync") -> Ontology:
|
|
125
|
+
"""Async variant of :func:`dql_fetch_ontology`."""
|
|
126
|
+
response = await client._http.get(KG_ONTOLOGY_ENDPOINT)
|
|
127
|
+
client._raise_for_status(response)
|
|
128
|
+
return Ontology.from_json(response.content)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""In-memory navigation of the Diffbot Knowledge Graph ontology.
|
|
2
|
+
|
|
3
|
+
The ontology is a JSON document describing the Knowledge Graph's entity types,
|
|
4
|
+
composite types, enums, and taxonomies. An agent constructing DQL needs it to
|
|
5
|
+
look up real field paths and taxonomy values instead of guessing them.
|
|
6
|
+
|
|
7
|
+
This module is pure and storage-agnostic: build an :class:`Ontology` from
|
|
8
|
+
already-parsed data (or from raw JSON / a file path) and query it. How the
|
|
9
|
+
ontology document is fetched, and whether or where it is cached, is left
|
|
10
|
+
entirely to the caller — the `db` CLI caches it on disk at
|
|
11
|
+
``~/.diffbot/ontology.json``; an in-process consumer (e.g. langchain) can cache
|
|
12
|
+
the :class:`Ontology` in memory. Fetch a fresh one over HTTP with
|
|
13
|
+
:meth:`diffbot.Diffbot.dql_fetch_ontology`.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import pathlib
|
|
18
|
+
import re
|
|
19
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Ontology:
|
|
23
|
+
"""Queryable view over a parsed Diffbot ontology document.
|
|
24
|
+
|
|
25
|
+
The instance holds the parsed document on :attr:`data` and exposes pure
|
|
26
|
+
lookup methods over it. Nothing here performs I/O — construct with already
|
|
27
|
+
parsed data, or use :meth:`from_json` / :meth:`from_path` for convenience.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
def __init__(self, data: Dict[str, Any]):
|
|
31
|
+
self.data = data
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
def from_json(cls, raw: Union[str, bytes]) -> "Ontology":
|
|
35
|
+
"""Build from a raw JSON string or bytes (e.g. an HTTP response body)."""
|
|
36
|
+
return cls(json.loads(raw))
|
|
37
|
+
|
|
38
|
+
@classmethod
|
|
39
|
+
def from_path(cls, path: Union[str, pathlib.Path]) -> "Ontology":
|
|
40
|
+
"""Build from a JSON file on disk."""
|
|
41
|
+
return cls(json.loads(pathlib.Path(path).read_text()))
|
|
42
|
+
|
|
43
|
+
def types(self) -> List[str]:
|
|
44
|
+
"""All entity type names (e.g. ``Organization``, ``Person``)."""
|
|
45
|
+
return sorted(self.data.get("types", {}).keys())
|
|
46
|
+
|
|
47
|
+
def composites(self) -> List[str]:
|
|
48
|
+
"""All composite type names (e.g. ``Location``, ``Employment``)."""
|
|
49
|
+
return sorted(self.data.get("composites", {}).keys())
|
|
50
|
+
|
|
51
|
+
def enums(self) -> List[str]:
|
|
52
|
+
"""All enum type names (e.g. ``Language``, ``Gender``)."""
|
|
53
|
+
return sorted(self.data.get("enums", {}).keys())
|
|
54
|
+
|
|
55
|
+
def taxonomies(self) -> List[str]:
|
|
56
|
+
"""All taxonomy names (e.g. ``OrganizationCategory``)."""
|
|
57
|
+
return sorted(self.data.get("taxonomies", {}).keys())
|
|
58
|
+
|
|
59
|
+
@staticmethod
|
|
60
|
+
def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
|
|
61
|
+
entry = container.get(type_name)
|
|
62
|
+
if entry is None:
|
|
63
|
+
raise KeyError(f"Unknown name: {type_name}")
|
|
64
|
+
return entry.get("fields", {})
|
|
65
|
+
|
|
66
|
+
def fields_for(self, type_name: str) -> Dict[str, Any]:
|
|
67
|
+
"""Return the field map of an entity type or composite.
|
|
68
|
+
|
|
69
|
+
Auto-routes: ``type_name`` may be an entity type (``Organization``) or a
|
|
70
|
+
composite (``Location``). Raises ``KeyError`` if it is neither.
|
|
71
|
+
"""
|
|
72
|
+
types = self.data.get("types", {})
|
|
73
|
+
composites = self.data.get("composites", {})
|
|
74
|
+
if type_name in types:
|
|
75
|
+
return self._fields_of(types, type_name)
|
|
76
|
+
if type_name in composites:
|
|
77
|
+
return self._fields_of(composites, type_name)
|
|
78
|
+
raise KeyError(f"{type_name} is not a known entity type or composite")
|
|
79
|
+
|
|
80
|
+
@staticmethod
|
|
81
|
+
def filter_fields(
|
|
82
|
+
fields: Dict[str, Any],
|
|
83
|
+
search: Optional[str],
|
|
84
|
+
include_deprecated: bool = False,
|
|
85
|
+
) -> List[Tuple[str, Dict[str, Any]]]:
|
|
86
|
+
"""Filter a field map by a name regex, dropping deprecated by default."""
|
|
87
|
+
pattern = re.compile(search, re.IGNORECASE) if search else None
|
|
88
|
+
out = []
|
|
89
|
+
for name, meta in fields.items():
|
|
90
|
+
if not include_deprecated and meta.get("isDeprecated"):
|
|
91
|
+
continue
|
|
92
|
+
if pattern and not pattern.search(name):
|
|
93
|
+
continue
|
|
94
|
+
out.append((name, meta))
|
|
95
|
+
return out
|
|
96
|
+
|
|
97
|
+
def taxonomy_values(self, name: str, search: Optional[str] = None) -> List[str]:
    """Flatten a taxonomy's category names, parents before their children.

    Args:
        name: key of the taxonomy inside ``self.data["taxonomies"]``.
        search: optional regular expression, matched case-insensitively
            anywhere in each category name; ``None``/``""`` keeps every name.

    Returns:
        Category names in pre-order (a category, then its ``children``).

    Raises:
        KeyError: if ``name`` is not a known taxonomy.
    """
    tax = self.data.get("taxonomies", {}).get(name)
    if tax is None:
        raise KeyError(f"Unknown taxonomy: {name}")
    pattern = re.compile(search, re.IGNORECASE) if search else None
    out: List[str] = []

    # Explicit stack instead of recursion so very deep trees cannot hit the
    # interpreter recursion limit. Children are pushed reversed so they pop
    # in document order, preserving the pre-order result.
    stack = list(reversed(tax.get("categories") or []))
    while stack:
        node = stack.pop()
        if not isinstance(node, dict):
            # Tolerate malformed entries (e.g. null) rather than crashing.
            continue
        label = node.get("name")
        # Same string guard as find_named: only non-empty string names count.
        if isinstance(label, str) and label and (pattern is None or pattern.search(label)):
            out.append(label)
        stack.extend(reversed(node.get("children") or []))
    return out
|
|
115
|
+
|
|
116
|
+
def enum_values(self, name: str) -> List[str]:
    """Return a fresh list of the values declared for enum ``name``.

    An enum entry with no ``values`` key, or with ``values`` set to null,
    yields an empty list.

    Raises:
        KeyError: if ``name`` is not present in ``self.data["enums"]``.
    """
    enum = self.data.get("enums", {}).get(name)
    if enum is None:
        raise KeyError(f"Unknown enum: {name}")
    # `or []` also covers an explicit null, which list() would reject.
    return list(enum.get("values") or [])
|
|
122
|
+
|
|
123
|
+
def find_named(self, search: str) -> List[str]:
    """Collect every string ``name`` value in the document that matches ``search``.

    Every dict and list reachable from ``self.data`` is visited. A dict
    contributes its ``"name"`` value when that value is a string matched by
    ``search`` (case-insensitive ``re.search``).

    Returns:
        The distinct matching names, sorted.
    """
    pattern = re.compile(search, re.IGNORECASE)
    found = set()

    # Iterative traversal: visit order is irrelevant because results are
    # de-duplicated and sorted, and an explicit work list avoids
    # RecursionError on deeply nested documents.
    pending = [self.data]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            label = node.get("name")
            if isinstance(label, str) and pattern.search(label):
                found.add(label)
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return sorted(found)
|
|
141
|
+
|
|
142
|
+
@staticmethod
def format_field(name: str, meta: Dict[str, Any]) -> str:
    """Build the display line ``<name>: [<type>] [flag]...`` for one field.

    The type falls back to ``?`` when ``meta`` has no ``type``. For a
    ``LinkedEntity`` with a non-empty ``leType``, the first ``leType`` entry
    is shown in parentheses. Flags are appended in a fixed order for each
    truthy marker in ``meta``.
    """
    type_label = meta.get("type", "?")
    if type_label == "LinkedEntity":
        linked = meta.get("leType") or []
        if linked:
            type_label = f"LinkedEntity ({linked[0]})"

    # (metadata key, rendered label), in output order.
    markers = (
        ("isList", "isList"),
        ("isComposite", "isComposite"),
        ("isEnum", "isEnum"),
        ("isDeprecated", "DEPRECATED"),
    )
    rendered = "".join(f" [{label}]" for key, label in markers if meta.get(key))
    return f"{name}: [{type_label}]{rendered}"
|
|
@@ -18,7 +18,7 @@ def web_search(
|
|
|
18
18
|
headers = {"Authorization": f"Bearer {client.token}"}
|
|
19
19
|
params: Dict[str, Any] = {"text": text}
|
|
20
20
|
if num_results is not None:
|
|
21
|
-
params["
|
|
21
|
+
params["size"] = num_results
|
|
22
22
|
if max_tokens is not None:
|
|
23
23
|
params["maxTokens"] = max_tokens
|
|
24
24
|
response = client._http.get(client.web_search_url, headers=headers, params=params)
|
|
@@ -36,7 +36,7 @@ async def web_search_async(
|
|
|
36
36
|
headers = {"Authorization": f"Bearer {client.token}"}
|
|
37
37
|
params: Dict[str, Any] = {"text": text}
|
|
38
38
|
if num_results is not None:
|
|
39
|
-
params["
|
|
39
|
+
params["size"] = num_results
|
|
40
40
|
if max_tokens is not None:
|
|
41
41
|
params["maxTokens"] = max_tokens
|
|
42
42
|
response = await client._http.get(client.web_search_url, headers=headers, params=params)
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
1
|
import pytest
|
|
4
2
|
|
|
5
|
-
from diffbot import Diffbot
|
|
3
|
+
from diffbot import Diffbot, resolve_token
|
|
6
4
|
|
|
7
5
|
|
|
8
6
|
@pytest.fixture(scope="session")
def live_token():
    """Session-scoped fixture: the token from ``resolve_token()``, or skip if it is empty."""
    token = resolve_token()
    if token:
        return token
    pytest.skip("no Diffbot token found (set DIFFBOT_API_TOKEN or ~/.diffbot/credentials)")
|
|
14
12
|
|
|
15
13
|
|
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
import httpx
|
|
2
2
|
import pytest
|
|
3
3
|
|
|
4
|
-
from diffbot import
|
|
4
|
+
from diffbot import (
|
|
5
|
+
APIError,
|
|
6
|
+
AuthError,
|
|
7
|
+
Diffbot,
|
|
8
|
+
ExtractionError,
|
|
9
|
+
RateLimitError,
|
|
10
|
+
ValidationError,
|
|
11
|
+
resolve_token,
|
|
12
|
+
)
|
|
5
13
|
|
|
6
14
|
|
|
7
15
|
"""
|
|
@@ -40,6 +48,36 @@ def test_token_required():
|
|
|
40
48
|
Diffbot(token="")
|
|
41
49
|
|
|
42
50
|
|
|
51
|
+
def test_resolve_token_explicit_wins(monkeypatch, tmp_path):
    """An explicit token beats both the env var and the credentials file."""
    # Make both fallback sources present so the precedence claim is exercised.
    monkeypatch.setenv("DIFFBOT_API_TOKEN", "env-token")
    creds = tmp_path / "credentials"
    creds.write_text("DIFFBOT_API_TOKEN=file-token\n")
    monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", creds)
    assert resolve_token("explicit-token") == "explicit-token"
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_resolve_token_from_env(monkeypatch, tmp_path):
    """With no explicit argument, the environment variable supplies the token."""
    # The credentials path points at a file that is never created.
    monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
    monkeypatch.setenv("DIFFBOT_API_TOKEN", "env-token")

    resolved = resolve_token()
    assert resolved == "env-token"

    # The resolved value can be handed straight to the client constructor.
    client = Diffbot(token=resolve_token())
    assert client.token == "env-token"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def test_resolve_token_from_credentials_file(monkeypatch, tmp_path):
    """Without the env var, the token is read from the credentials file."""
    credentials_file = tmp_path / "credentials"
    credentials_file.write_text("DIFFBOT_API_TOKEN=file-token\n")
    monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", credentials_file)
    monkeypatch.delenv("DIFFBOT_API_TOKEN", raising=False)

    resolved = resolve_token()
    assert resolved == "file-token"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_resolve_token_missing_returns_empty(monkeypatch, tmp_path):
    """With neither env var nor credentials file, the result is an empty string."""
    monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
    monkeypatch.delenv("DIFFBOT_API_TOKEN", raising=False)

    resolved = resolve_token()
    assert resolved == ""
|
|
79
|
+
|
|
80
|
+
|
|
43
81
|
def test_user_agent_header():
|
|
44
82
|
captured = {}
|
|
45
83
|
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from diffbot import Diffbot, DiffbotAsync, Ontology
|
|
7
|
+
|
|
8
|
+
# Small ontology document shared by the tests in this module: two entity
# types, one composite, one enum and one two-level taxonomy.
_ORGANIZATION_FIELDS = {
    "name": {"type": "String"},
    "location": {"type": "Location", "isComposite": True},
    "oldField": {"type": "String", "isDeprecated": True},
}
_ORGANIZATION_CATEGORIES = [
    {"name": "Technology", "children": [{"name": "Semiconductor Companies"}]},
]

FIXTURE_ONTOLOGY = {
    "types": {
        "Organization": {"fields": _ORGANIZATION_FIELDS},
        "Person": {"fields": {"name": {"type": "String"}}},
    },
    "composites": {
        "Location": {"fields": {"city": {"type": "City", "isComposite": True}}},
    },
    "enums": {"Language": {"values": ["EN", "FR", "DE"]}},
    "taxonomies": {
        "OrganizationCategory": {"categories": _ORGANIZATION_CATEGORIES},
    },
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@pytest.fixture
def ont() -> Ontology:
    """Provide an ``Ontology`` built from ``FIXTURE_ONTOLOGY``."""
    ontology = Ontology(FIXTURE_ONTOLOGY)
    return ontology
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_navigation_helpers(ont):
    """The listing, enum, taxonomy and search helpers return the fixture's names."""
    expected_listings = [
        (ont.types, ["Organization", "Person"]),
        (ont.composites, ["Location"]),
        (ont.enums, ["Language"]),
        (ont.taxonomies, ["OrganizationCategory"]),
    ]
    for lister, names in expected_listings:
        assert lister() == names

    assert ont.enum_values("Language") == ["EN", "FR", "DE"]

    semi_matches = ont.taxonomy_values("OrganizationCategory", "semi")
    assert semi_matches == ["Semiconductor Companies"]

    company_matches = ont.find_named("compan")
    assert company_matches == ["Semiconductor Companies"]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_fields_for_routes_types_and_composites(ont):
    """``fields_for`` resolves entity types and composites, and rejects unknown names."""
    organization_fields = ont.fields_for("Organization")
    assert "name" in organization_fields

    location_fields = ont.fields_for("Location")
    assert "city" in location_fields

    with pytest.raises(KeyError):
        ont.fields_for("NopeType")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_filter_fields_drops_deprecated_by_default(ont):
    """Deprecated fields are hidden unless ``include_deprecated=True`` is passed."""
    org_fields = ont.fields_for("Organization")

    visible = Ontology.filter_fields(org_fields, None)
    assert "oldField" not in [field_name for field_name, _meta in visible]

    everything = Ontology.filter_fields(org_fields, None, include_deprecated=True)
    assert "oldField" in [field_name for field_name, _meta in everything]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_format_field(ont):
    """A composite field renders as ``name: [Type] [isComposite]``."""
    location_meta = ont.fields_for("Organization")["location"]
    rendered = Ontology.format_field("location", location_meta)
    assert rendered == "location: [Location] [isComposite]"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_from_json_and_from_path(tmp_path):
    """An ``Ontology`` can be built from a JSON string or from a JSON file on disk."""
    payload = json.dumps(FIXTURE_ONTOLOGY)

    from_text = Ontology.from_json(payload)
    assert from_text.types() == ["Organization", "Person"]

    ontology_file = tmp_path / "ontology.json"
    ontology_file.write_text(payload)
    from_disk = Ontology.from_path(ontology_file)
    assert from_disk.enums() == ["Language"]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_unknown_taxonomy_and_enum_raise(ont):
    """Unknown taxonomy and enum names both raise ``KeyError``."""
    for lookup in (ont.taxonomy_values, ont.enum_values):
        with pytest.raises(KeyError):
            lookup("Nope")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def test_dql_fetch_ontology_returns_ontology():
    """``Diffbot.dql_fetch_ontology`` wraps the HTTP response body in an ``Ontology``."""

    def serve_ontology(request: httpx.Request) -> httpx.Response:
        # The client is expected to request a path ending in /ontology.
        assert request.url.path.endswith("/ontology")
        return httpx.Response(200, json=FIXTURE_ONTOLOGY)

    transport = httpx.MockTransport(serve_ontology)
    client = Diffbot(token="test-token", transport=transport)

    fetched = client.dql_fetch_ontology()
    assert isinstance(fetched, Ontology)
    assert fetched.types() == ["Organization", "Person"]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@pytest.mark.anyio
async def test_async_dql_fetch_ontology_returns_ontology():
    """``DiffbotAsync.dql_fetch_ontology`` also returns an ``Ontology``."""

    def serve_ontology(request: httpx.Request) -> httpx.Response:
        return httpx.Response(200, json=FIXTURE_ONTOLOGY)

    transport = httpx.MockTransport(serve_ontology)
    client = DiffbotAsync(token="test-token", transport=transport)

    fetched = await client.dql_fetch_ontology()
    assert isinstance(fetched, Ontology)
    assert fetched.composites() == ["Location"]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@pytest.mark.anyio
async def test_async_dql_parallel_runs_all_queries():
    """``dql_parallel`` returns one result per query, in the order the queries were given."""

    def count_hits(request: httpx.Request) -> httpx.Response:
        # The hit count depends on the query text, which lets the test tell
        # the two responses apart.
        query_text = request.url.params["query"]
        if "Diffbot" in query_text:
            hit_count = 5
        else:
            hit_count = 100
        return httpx.Response(200, json={"hits": hit_count, "results": 0})

    client = DiffbotAsync(token="test-token", transport=httpx.MockTransport(count_hits))
    queries = [
        {"query": 'type:Organization name:"Diffbot"', "size": 0},
        {"query": "type:Organization", "size": 0},
    ]

    responses = await client.dql_parallel(queries)
    assert [response["hits"] for response in responses] == [5, 100]
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import httpx
|
|
3
3
|
import pytest
|
|
4
4
|
|
|
5
|
-
from diffbot import CrawlEventType, Diffbot, DiffbotAsync
|
|
5
|
+
from diffbot import CrawlEventType, Diffbot, DiffbotAsync, resolve_token
|
|
6
6
|
|
|
7
7
|
SSE_PARIS = 'data: {"choices": [{"delta": {"content": "Paris"}}]}\n'
|
|
8
8
|
|
|
@@ -173,3 +173,21 @@ async def test_readme_async_entities():
|
|
|
173
173
|
assert len(result["entities"]) == 2
|
|
174
174
|
assert result["entities"][0]["name"] == "Apple"
|
|
175
175
|
assert result["sentiment"] == 0.3
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
# Authentication
|
|
180
|
+
# ---------------------------------------------------------------------------
|
|
181
|
+
|
|
182
|
+
def test_readme_authentication_resolve_token(monkeypatch, tmp_path):
    """README "Authentication": ``Diffbot(token=resolve_token())`` using the env var."""
    monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
    monkeypatch.setenv("DIFFBOT_API_TOKEN", "test-token")

    def check_token(request: httpx.Request) -> httpx.Response:
        # The resolved token must arrive as the `token` query parameter.
        assert request.url.params["token"] == "test-token"
        return httpx.Response(200, json={"objects": [{"title": "Example"}]})

    transport = httpx.MockTransport(check_token)
    client = Diffbot(token=resolve_token(), transport=transport)

    payload = client.extract("https://www.example.com")
    assert "objects" in payload
|
diffbot_python-0.1.0/AGENTS.md
DELETED
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
# Agent Guidelines
|
|
2
|
-
|
|
3
|
-
## README Examples
|
|
4
|
-
|
|
5
|
-
Whenever a code example in `README.md` is added or updated, the corresponding test must be added or updated in `tests/test_readme_examples.py`. Run `python -m pytest tests/test_readme_examples.py` to validate before considering the work complete.
|
diffbot_python-0.1.0/CLAUDE.md
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
AGENTS.md
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import pathlib
|
|
3
|
-
|
|
4
|
-
import click
|
|
5
|
-
|
|
6
|
-
from diffbot import Diffbot
|
|
7
|
-
|
|
8
|
-
CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def resolve_token() -> str:
|
|
12
|
-
"""Return the Diffbot API token from the env var, falling back to ~/.diffbot/credentials."""
|
|
13
|
-
token = os.environ.get("DIFFBOT_API_TOKEN", "").strip()
|
|
14
|
-
if token:
|
|
15
|
-
return token
|
|
16
|
-
|
|
17
|
-
if CREDENTIALS_PATH.exists():
|
|
18
|
-
for line in CREDENTIALS_PATH.read_text().splitlines():
|
|
19
|
-
line = line.strip()
|
|
20
|
-
if line.startswith("DIFFBOT_API_TOKEN="):
|
|
21
|
-
return line[len("DIFFBOT_API_TOKEN="):].strip()
|
|
22
|
-
|
|
23
|
-
return ""
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def get_client() -> Diffbot:
|
|
27
|
-
token = resolve_token()
|
|
28
|
-
if not token:
|
|
29
|
-
click.echo(
|
|
30
|
-
"Error: no Diffbot API token found.\n"
|
|
31
|
-
" Set a DIFFBOT_API_TOKEN environment variable, or\n"
|
|
32
|
-
f" write 'DIFFBOT_API_TOKEN=YOUR_TOKEN' to {CREDENTIALS_PATH}",
|
|
33
|
-
err=True,
|
|
34
|
-
)
|
|
35
|
-
raise click.Abort()
|
|
36
|
-
return Diffbot(token=token)
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import pathlib
|
|
3
|
-
import re
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
5
|
-
|
|
6
|
-
ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
|
|
7
|
-
|
|
8
|
-
_CACHE: Dict[str, Any] = {}
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
def load() -> Dict[str, Any]:
|
|
12
|
-
if "data" not in _CACHE:
|
|
13
|
-
if not ONTOLOGY_PATH.exists():
|
|
14
|
-
raise FileNotFoundError(
|
|
15
|
-
f"Ontology not found at {ONTOLOGY_PATH}. Run: db dql init"
|
|
16
|
-
)
|
|
17
|
-
_CACHE["data"] = json.loads(ONTOLOGY_PATH.read_text())
|
|
18
|
-
return _CACHE["data"]
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def list_types() -> List[str]:
|
|
22
|
-
return sorted(load().get("types", {}).keys())
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def list_composites() -> List[str]:
|
|
26
|
-
return sorted(load().get("composites", {}).keys())
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def list_enums() -> List[str]:
|
|
30
|
-
return sorted(load().get("enums", {}).keys())
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def list_taxonomies() -> List[str]:
|
|
34
|
-
return sorted(load().get("taxonomies", {}).keys())
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
|
|
38
|
-
entry = container.get(type_name)
|
|
39
|
-
if entry is None:
|
|
40
|
-
raise KeyError(f"Unknown name: {type_name}")
|
|
41
|
-
return entry.get("fields", {})
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def fields_for(type_name: str) -> Dict[str, Any]:
|
|
45
|
-
data = load()
|
|
46
|
-
types = data.get("types", {})
|
|
47
|
-
composites = data.get("composites", {})
|
|
48
|
-
if type_name in types:
|
|
49
|
-
return _fields_of(types, type_name)
|
|
50
|
-
if type_name in composites:
|
|
51
|
-
return _fields_of(composites, type_name)
|
|
52
|
-
raise KeyError(f"{type_name} is not a known entity type or composite")
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def format_field(name: str, meta: Dict[str, Any]) -> str:
|
|
56
|
-
t = meta.get("type", "?")
|
|
57
|
-
if t == "LinkedEntity":
|
|
58
|
-
le = meta.get("leType") or []
|
|
59
|
-
if le:
|
|
60
|
-
t = f"LinkedEntity ({le[0]})"
|
|
61
|
-
flags = []
|
|
62
|
-
if meta.get("isList"):
|
|
63
|
-
flags.append("isList")
|
|
64
|
-
if meta.get("isComposite"):
|
|
65
|
-
flags.append("isComposite")
|
|
66
|
-
if meta.get("isEnum"):
|
|
67
|
-
flags.append("isEnum")
|
|
68
|
-
if meta.get("isDeprecated"):
|
|
69
|
-
flags.append("DEPRECATED")
|
|
70
|
-
suffix = "".join(f" [{f}]" for f in flags)
|
|
71
|
-
return f"{name}: [{t}]{suffix}"
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def filter_fields(fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False) -> List[tuple]:
|
|
75
|
-
pattern = re.compile(search, re.IGNORECASE) if search else None
|
|
76
|
-
out = []
|
|
77
|
-
for name, meta in fields.items():
|
|
78
|
-
if not include_deprecated and meta.get("isDeprecated"):
|
|
79
|
-
continue
|
|
80
|
-
if pattern and not pattern.search(name):
|
|
81
|
-
continue
|
|
82
|
-
out.append((name, meta))
|
|
83
|
-
return out
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
|
|
87
|
-
data = load()
|
|
88
|
-
tax = data.get("taxonomies", {}).get(name)
|
|
89
|
-
if tax is None:
|
|
90
|
-
raise KeyError(f"Unknown taxonomy: {name}")
|
|
91
|
-
pattern = re.compile(search, re.IGNORECASE) if search else None
|
|
92
|
-
out: List[str] = []
|
|
93
|
-
|
|
94
|
-
def walk(node: Dict[str, Any]) -> None:
|
|
95
|
-
n = node.get("name")
|
|
96
|
-
if n and (pattern is None or pattern.search(n)):
|
|
97
|
-
out.append(n)
|
|
98
|
-
for child in node.get("children", []) or []:
|
|
99
|
-
walk(child)
|
|
100
|
-
|
|
101
|
-
for cat in tax.get("categories", []) or []:
|
|
102
|
-
walk(cat)
|
|
103
|
-
return out
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
def enum_values(name: str) -> List[str]:
|
|
107
|
-
data = load()
|
|
108
|
-
enum = data.get("enums", {}).get(name)
|
|
109
|
-
if enum is None:
|
|
110
|
-
raise KeyError(f"Unknown enum: {name}")
|
|
111
|
-
return list(enum.get("values", []))
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def find_named(search: str) -> List[str]:
|
|
115
|
-
pattern = re.compile(search, re.IGNORECASE)
|
|
116
|
-
found = set()
|
|
117
|
-
|
|
118
|
-
def walk(node: Any) -> None:
|
|
119
|
-
if isinstance(node, dict):
|
|
120
|
-
n = node.get("name")
|
|
121
|
-
if isinstance(n, str) and pattern.search(n):
|
|
122
|
-
found.add(n)
|
|
123
|
-
for v in node.values():
|
|
124
|
-
walk(v)
|
|
125
|
-
elif isinstance(node, list):
|
|
126
|
-
for v in node:
|
|
127
|
-
walk(v)
|
|
128
|
-
|
|
129
|
-
walk(load())
|
|
130
|
-
return sorted(found)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|