diffbot-python 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/.gitignore +3 -0
  2. diffbot_python-0.1.0/README.md → diffbot_python-0.2.0/PKG-INFO +76 -5
  3. diffbot_python-0.1.0/PKG-INFO → diffbot_python-0.2.0/README.md +41 -28
  4. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/pyproject.toml +40 -10
  5. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/__init__.py +4 -0
  6. diffbot_python-0.2.0/src/diffbot/_auth.py +41 -0
  7. diffbot_python-0.2.0/src/diffbot/cli/_common.py +21 -0
  8. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/dql.py +3 -1
  9. diffbot_python-0.2.0/src/diffbot/cli/ontology.py +74 -0
  10. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/client.py +18 -2
  11. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/kg.py +38 -0
  12. diffbot_python-0.2.0/src/diffbot/ontology.py +160 -0
  13. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/web_search.py +2 -2
  14. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/conftest.py +3 -5
  15. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_extract.py +39 -1
  16. diffbot_python-0.2.0/tests/test_ontology.py +119 -0
  17. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_readme_examples.py +19 -1
  18. diffbot_python-0.1.0/AGENTS.md +0 -5
  19. diffbot_python-0.1.0/CLAUDE.md +0 -1
  20. diffbot_python-0.1.0/src/diffbot/cli/_common.py +0 -36
  21. diffbot_python-0.1.0/src/diffbot/cli/ontology.py +0 -130
  22. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/LICENSE +0 -0
  23. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/ask.py +0 -0
  24. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/__init__.py +0 -0
  25. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/__main__.py +0 -0
  26. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/cli/entities.py +0 -0
  27. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/crawl.py +0 -0
  28. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/errors.py +0 -0
  29. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/extract.py +0 -0
  30. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/src/diffbot/nlp.py +0 -0
  31. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_ask.py +0 -0
  32. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_async.py +0 -0
  33. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_crawl.py +0 -0
  34. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_dql.py +0 -0
  35. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_dql_cli.py +0 -0
  36. {diffbot_python-0.1.0 → diffbot_python-0.2.0}/tests/test_web_search.py +0 -0
@@ -3,6 +3,9 @@
3
3
  __pycache__
4
4
  .pytest_cache
5
5
  .env
6
+ dist/
7
+ build/
8
+ *.egg-info/
6
9
 
7
10
  # Claude
8
11
  .claude/settings.local.json
@@ -1,3 +1,38 @@
1
+ Metadata-Version: 2.4
2
+ Name: diffbot-python
3
+ Version: 0.2.0
4
+ Summary: Python client library for Diffbot APIs
5
+ Project-URL: Homepage, https://github.com/diffbot/diffbot-python
6
+ Project-URL: Documentation, https://github.com/diffbot/diffbot-python#readme
7
+ Project-URL: Repository, https://github.com/diffbot/diffbot-python
8
+ Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
9
+ Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: api-client,crawler,diffbot,extract,knowledge-graph,llm,nlp,web-scraping
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3 :: Only
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Internet :: WWW/HTTP
23
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Topic :: Text Processing :: Markup :: HTML
27
+ Classifier: Typing :: Typed
28
+ Requires-Python: >=3.10
29
+ Requires-Dist: click>=8.1.0
30
+ Requires-Dist: httpx>=0.27.0
31
+ Requires-Dist: rich>=13.0.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
1
36
  # Diffbot Python Library
2
37
 
3
38
  Python client library for [Diffbot](https://www.diffbot.com) APIs.
@@ -6,7 +41,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
6
41
  ## Installation
7
42
 
8
43
  ```bash
9
- pip install git+https://github.com/diffbot/diffbot-python.git
44
+ python3 -m pip install diffbot-python
10
45
  ```
11
46
 
12
47
  Or, for local development:
@@ -18,12 +53,38 @@ pip install -e ".[dev]"
18
53
  ## Usage
19
54
 
20
55
  ### Authentication
21
- Set your Diffbot API token in your environment or .env.
56
+
57
+ The CLI and the library can share a single credential. The token always has to be
58
+ passed to the client explicitly, but `resolve_token()` gives you the same lookup the
59
+ CLI uses, in this order:
60
+
61
+ 1. An explicit token passed to `resolve_token(token)`.
62
+ 2. The `DIFFBOT_API_TOKEN` environment variable.
63
+ 3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
64
+
65
+ Set it once and it works for both the CLI and your scripts. Either export it:
22
66
 
23
67
  ```bash
24
68
  export DIFFBOT_API_TOKEN=<TOKEN>
25
69
  ```
26
70
 
71
+ …or write it to the shared credentials file (handy for keeping it out of your shell environment):
72
+
73
+ ```bash
74
+ mkdir -p ~/.diffbot
75
+ printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
76
+ chmod 600 ~/.diffbot/credentials
77
+ ```
78
+
79
+ With either in place, resolve the token and pass it to the client:
80
+
81
+ ```python
82
+ from diffbot import Diffbot, resolve_token
83
+
84
+ db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
85
+ data = db.extract("https://www.example.com")
86
+ ```
87
+
27
88
  ### Extract structured content
28
89
  ```python
29
90
  from diffbot import Diffbot
@@ -166,7 +227,15 @@ asyncio.run(main())
166
227
 
167
228
  ## CLI
168
229
 
169
- This library also includes a CLI.
230
+ This library also includes a CLI exposed as the `db` command.
231
+
232
+ To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
233
+
234
+ ```bash
235
+ uv tool install .
236
+ ```
237
+
238
+ This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
170
239
 
171
240
  ```bash
172
241
  export DIFFBOT_API_TOKEN=your-token-here
@@ -189,7 +258,9 @@ Run the mock test suite:
189
258
  python -m pytest
190
259
  ```
191
260
 
192
- Run live integration tests against the real API (requires a valid token):
261
+ Run live integration tests against the real API (requires a valid token).
262
+ The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
263
+ environment variable or `~/.diffbot/credentials`:
193
264
  ```bash
194
- DIFFBOT_TOKEN=your_token python -m pytest -m live
265
+ DIFFBOT_API_TOKEN=your_token python -m pytest -m live
195
266
  ```
@@ -1,26 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: diffbot-python
3
- Version: 0.1.0
4
- Summary: Python client library for Diffbot APIs
5
- Project-URL: Homepage, https://github.com/diffbot/diffbot-python
6
- Project-URL: Repository, https://github.com/diffbot/diffbot-python
7
- Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
8
- Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
9
- License-Expression: MIT
10
- License-File: LICENSE
11
- Classifier: Operating System :: OS Independent
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
14
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
- Classifier: Topic :: Software Development :: Libraries
16
- Requires-Python: >=3.10
17
- Requires-Dist: click>=8.1.0
18
- Requires-Dist: httpx>=0.27.0
19
- Requires-Dist: rich>=13.0.0
20
- Provides-Extra: dev
21
- Requires-Dist: pytest>=8.0.0; extra == 'dev'
22
- Description-Content-Type: text/markdown
23
-
24
1
  # Diffbot Python Library
25
2
 
26
3
  Python client library for [Diffbot](https://www.diffbot.com) APIs.
@@ -29,7 +6,7 @@ Python client library for [Diffbot](https://www.diffbot.com) APIs.
29
6
  ## Installation
30
7
 
31
8
  ```bash
32
- pip install git+https://github.com/diffbot/diffbot-python.git
9
+ python3 -m pip install diffbot-python
33
10
  ```
34
11
 
35
12
  Or, for local development:
@@ -41,12 +18,38 @@ pip install -e ".[dev]"
41
18
  ## Usage
42
19
 
43
20
  ### Authentication
44
- Set your Diffbot API token in your environment or .env.
21
+
22
+ The CLI and the library can share a single credential. The token always has to be
23
+ passed to the client explicitly, but `resolve_token()` gives you the same lookup the
24
+ CLI uses, in this order:
25
+
26
+ 1. An explicit token passed to `resolve_token(token)`.
27
+ 2. The `DIFFBOT_API_TOKEN` environment variable.
28
+ 3. A `DIFFBOT_API_TOKEN=...` line in `~/.diffbot/credentials`.
29
+
30
+ Set it once and it works for both the CLI and your scripts. Either export it:
45
31
 
46
32
  ```bash
47
33
  export DIFFBOT_API_TOKEN=<TOKEN>
48
34
  ```
49
35
 
36
+ …or write it to the shared credentials file (handy for keeping it out of your shell environment):
37
+
38
+ ```bash
39
+ mkdir -p ~/.diffbot
40
+ printf 'DIFFBOT_API_TOKEN=%s\n' '<TOKEN>' > ~/.diffbot/credentials
41
+ chmod 600 ~/.diffbot/credentials
42
+ ```
43
+
44
+ With either in place, resolve the token and pass it to the client:
45
+
46
+ ```python
47
+ from diffbot import Diffbot, resolve_token
48
+
49
+ db = Diffbot(token=resolve_token()) # from env var or ~/.diffbot/credentials
50
+ data = db.extract("https://www.example.com")
51
+ ```
52
+
50
53
  ### Extract structured content
51
54
  ```python
52
55
  from diffbot import Diffbot
@@ -189,7 +192,15 @@ asyncio.run(main())
189
192
 
190
193
  ## CLI
191
194
 
192
- This library also includes a CLI.
195
+ This library also includes a CLI exposed as the `db` command.
196
+
197
+ To make `db` available from anywhere, install it as an isolated tool with [uv](https://docs.astral.sh/uv/):
198
+
199
+ ```bash
200
+ uv tool install .
201
+ ```
202
+
203
+ This drops a `db` executable into `~/.local/bin` (ensure it is on your `PATH`). Use `--force` to reinstall or upgrade after changes, or `--editable` to have source edits take effect immediately. Alternatively, a plain `pip install .` (or `pip install -e .`) also installs the `db` entry point into the active environment.
193
204
 
194
205
  ```bash
195
206
  export DIFFBOT_API_TOKEN=your-token-here
@@ -212,7 +223,9 @@ Run the mock test suite:
212
223
  python -m pytest
213
224
  ```
214
225
 
215
- Run live integration tests against the real API (requires a valid token):
226
+ Run live integration tests against the real API (requires a valid token).
227
+ The token is resolved the same way as everywhere else — the `DIFFBOT_API_TOKEN`
228
+ environment variable or `~/.diffbot/credentials`:
216
229
  ```bash
217
- DIFFBOT_TOKEN=your_token python -m pytest -m live
230
+ DIFFBOT_API_TOKEN=your_token python -m pytest -m live
218
231
  ```
@@ -4,23 +4,43 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "diffbot-python"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Python client library for Diffbot APIs"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
11
- classifiers = [
12
- "Programming Language :: Python :: 3",
13
- "Operating System :: OS Independent",
14
- "Topic :: Software Development :: Libraries",
15
- "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
- "Topic :: Internet :: WWW/HTTP :: Indexing/Search"
17
- ]
18
11
  license = "MIT"
19
12
  license-files = ["LICEN[CS]E*"]
20
13
  authors = [
21
14
  { name = "Jerome Choo", email = "jerome@diffbot.com" },
22
15
  { name = "Mike Tung", email = "miket@diffbot.com" }
23
16
  ]
17
+ keywords = [
18
+ "diffbot",
19
+ "knowledge-graph",
20
+ "web-scraping",
21
+ "extract",
22
+ "crawler",
23
+ "nlp",
24
+ "llm",
25
+ "api-client",
26
+ ]
27
+ classifiers = [
28
+ "Development Status :: 3 - Alpha",
29
+ "Intended Audience :: Developers",
30
+ "Operating System :: OS Independent",
31
+ "Programming Language :: Python :: 3",
32
+ "Programming Language :: Python :: 3 :: Only",
33
+ "Programming Language :: Python :: 3.10",
34
+ "Programming Language :: Python :: 3.11",
35
+ "Programming Language :: Python :: 3.12",
36
+ "Programming Language :: Python :: 3.13",
37
+ "Topic :: Internet :: WWW/HTTP",
38
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
39
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
40
+ "Topic :: Software Development :: Libraries :: Python Modules",
41
+ "Topic :: Text Processing :: Markup :: HTML",
42
+ "Typing :: Typed",
43
+ ]
24
44
  dependencies = [
25
45
  "httpx>=0.27.0",
26
46
  "click>=8.1.0",
@@ -34,6 +54,7 @@ dev = [
34
54
 
35
55
  [project.urls]
36
56
  Homepage = "https://github.com/diffbot/diffbot-python"
57
+ Documentation = "https://github.com/diffbot/diffbot-python#readme"
37
58
  Repository = "https://github.com/diffbot/diffbot-python"
38
59
  Issues = "https://github.com/diffbot/diffbot-python/issues"
39
60
 
@@ -43,6 +64,15 @@ db = "diffbot.cli:main"
43
64
  [tool.hatch.build.targets.wheel]
44
65
  packages = ["src/diffbot"]
45
66
 
67
+ [tool.hatch.build.targets.sdist]
68
+ include = [
69
+ "/src",
70
+ "/tests",
71
+ "/README.md",
72
+ "/LICENSE",
73
+ "/pyproject.toml",
74
+ ]
75
+
46
76
  [tool.pytest.ini_options]
47
- markers = ["live: marks tests as live integration tests requiring a real DIFFBOT_TOKEN"]
48
- addopts = "-m 'not live'"
77
+ markers = ["live: marks tests as live integration tests requiring a real DIFFBOT_API_TOKEN"]
78
+ addopts = "-m 'not live'"
@@ -4,6 +4,7 @@ diffbot - Python client library for the Diffbot APIs.
4
4
 
5
5
  __version__ = "0.1.0"
6
6
 
7
+ from ._auth import resolve_token
7
8
  from .client import Diffbot, DiffbotAsync
8
9
  from .crawl import CrawlEvent, CrawlEventType
9
10
  from .errors import (
@@ -14,12 +15,15 @@ from .errors import (
14
15
  RateLimitError,
15
16
  ValidationError,
16
17
  )
18
+ from .ontology import Ontology
17
19
 
18
20
  __all__ = [
19
21
  "Diffbot",
20
22
  "DiffbotAsync",
23
+ "resolve_token",
21
24
  "CrawlEvent",
22
25
  "CrawlEventType",
26
+ "Ontology",
23
27
  "DiffbotError",
24
28
  "AuthError",
25
29
  "ExtractionError",
@@ -0,0 +1,41 @@
1
+ """Shared Diffbot credential resolution for both the library and the CLI.
2
+
3
+ The same lookup chain is used everywhere so a single credential works for the
4
+ ``db`` CLI and any Python script that constructs a client:
5
+
6
+ 1. An explicit token passed to the client / function.
7
+ 2. The ``DIFFBOT_API_TOKEN`` environment variable.
8
+ 3. A ``DIFFBOT_API_TOKEN=...`` line in ``~/.diffbot/credentials``.
9
+ """
10
+
11
+ import os
12
+ import pathlib
13
+ from typing import Optional
14
+
15
+ TOKEN_ENV_VAR = "DIFFBOT_API_TOKEN"
16
+ CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
17
+
18
+
19
def _read_credentials_file() -> str:
    """Return the token stored in the credentials file, or ``""``.

    Scans ``CREDENTIALS_PATH`` line by line and returns the value of the first
    ``<TOKEN_ENV_VAR>=...`` entry with surrounding whitespace stripped.

    A missing, unreadable, or undecodable file is treated as "no token" so
    callers reach their own "token not found" handling instead of crashing on
    an I/O error.
    """
    prefix = f"{TOKEN_ENV_VAR}="
    try:
        # Read directly instead of checking exists() first: this avoids a
        # check-then-use race and also covers directories and permission
        # errors. UTF-8 keeps the result independent of the platform locale.
        text = CREDENTIALS_PATH.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return ""
    for line in text.splitlines():
        line = line.strip()
        if line.startswith(prefix):
            return line[len(prefix):].strip()
    return ""
27
+
28
+
29
def resolve_token(token: Optional[str] = None) -> str:
    """Find a Diffbot API token using the shared lookup chain.

    Candidates are tried in order: the ``token`` argument, the environment
    variable named by ``TOKEN_ENV_VAR``, then the credentials file. Surrounding
    whitespace is stripped; an empty string means nothing was found.
    """
    explicit = token.strip() if token else ""
    if explicit:
        return explicit

    from_env = os.environ.get(TOKEN_ENV_VAR, "").strip()
    return from_env if from_env else _read_credentials_file()
@@ -0,0 +1,21 @@
1
+ import click
2
+
3
+ from diffbot import Diffbot, resolve_token
4
+ from diffbot._auth import CREDENTIALS_PATH, TOKEN_ENV_VAR
5
+
6
+
7
def get_client() -> Diffbot:
    """Return a Diffbot client built from the shared credential lookup.

    The token comes from ``resolve_token()``. When none is found, a hint is
    printed to stderr and the command is aborted with ``click.Abort``.
    """
    token = resolve_token()
    if token:
        return Diffbot(token=token)

    hint = (
        "Error: no Diffbot API token found.\n"
        f" Set a {TOKEN_ENV_VAR} environment variable, or\n"
        f" write '{TOKEN_ENV_VAR}=YOUR_TOKEN' to {CREDENTIALS_PATH}"
    )
    click.echo(hint, err=True)
    raise click.Abort()
@@ -15,7 +15,9 @@ from rich.table import Table
15
15
  from diffbot import DiffbotError
16
16
 
17
17
  from . import ontology
18
- from ._common import get_client, resolve_token
18
+ from diffbot import resolve_token
19
+
20
+ from ._common import get_client
19
21
 
20
22
 
21
23
  class _DqlGroup(click.Group):
@@ -0,0 +1,74 @@
1
+ """CLI-side ontology access: a disk cache over the storage-agnostic core.
2
+
3
+ The navigation logic lives in :mod:`diffbot.ontology` (the `Ontology` class).
4
+ This module adds the CLI's caching policy on top: the ontology is read once from
5
+ ``~/.diffbot/ontology.json`` (populated by `db dql init`) and held in
6
+ ``_CACHE``. The module-level functions preserve the historical CLI surface and
7
+ simply delegate to an `Ontology` built from the cached document.
8
+ """
9
+
10
+ import json
11
+ import pathlib
12
+ from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ from diffbot.ontology import Ontology
15
+
16
+ ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
17
+
18
+ _CACHE: Dict[str, Any] = {}
19
+
20
+
21
def _data() -> Dict[str, Any]:
    """Return the parsed ontology document, loading it from disk on first use.

    The parsed document is memoised in ``_CACHE["data"]``, so the file is read
    at most once per process.

    Raises:
        FileNotFoundError: if ``ONTOLOGY_PATH`` does not exist.
    """
    if "data" not in _CACHE:
        if not ONTOLOGY_PATH.exists():
            raise FileNotFoundError(
                f"Ontology not found at {ONTOLOGY_PATH}. Run: db dql init"
            )
        # Parse the raw bytes: json.loads() detects the JSON Unicode encoding
        # itself, whereas read_text() would decode with the platform locale
        # and can garble non-ASCII names.
        _CACHE["data"] = json.loads(ONTOLOGY_PATH.read_bytes())
    return _CACHE["data"]
29
+
30
+
31
def _ontology() -> Ontology:
    """Wrap the cached ontology document in an ``Ontology`` view."""
    document = _data()
    return Ontology(document)
33
+
34
+
35
def list_types() -> List[str]:
    """Return ``Ontology.types()`` for the cached ontology."""
    ontology = _ontology()
    return ontology.types()
37
+
38
+
39
def list_composites() -> List[str]:
    """Return ``Ontology.composites()`` for the cached ontology."""
    ontology = _ontology()
    return ontology.composites()
41
+
42
+
43
def list_enums() -> List[str]:
    """Return ``Ontology.enums()`` for the cached ontology."""
    ontology = _ontology()
    return ontology.enums()
45
+
46
+
47
def list_taxonomies() -> List[str]:
    """Return ``Ontology.taxonomies()`` for the cached ontology."""
    ontology = _ontology()
    return ontology.taxonomies()
49
+
50
+
51
def fields_for(type_name: str) -> Dict[str, Any]:
    """Return ``Ontology.fields_for(type_name)`` for the cached ontology."""
    ontology = _ontology()
    return ontology.fields_for(type_name)
53
+
54
+
55
def format_field(name: str, meta: Dict[str, Any]) -> str:
    """Delegate to ``Ontology.format_field``; no cached data is needed."""
    rendered = Ontology.format_field(name, meta)
    return rendered
57
+
58
+
59
def filter_fields(
    fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False
) -> List[Tuple[str, Dict[str, Any]]]:
    """Delegate to ``Ontology.filter_fields``; no cached data is needed."""
    matches = Ontology.filter_fields(
        fields, search, include_deprecated=include_deprecated
    )
    return matches
63
+
64
+
65
def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
    """Return ``Ontology.taxonomy_values(name, search)`` for the cached ontology."""
    ontology = _ontology()
    return ontology.taxonomy_values(name, search)
67
+
68
+
69
def enum_values(name: str) -> List[str]:
    """Return ``Ontology.enum_values(name)`` for the cached ontology."""
    ontology = _ontology()
    return ontology.enum_values(name)
71
+
72
+
73
def find_named(search: str) -> List[str]:
    """Return ``Ontology.find_named(search)`` for the cached ontology."""
    ontology = _ontology()
    return ontology.find_named(search)
@@ -24,9 +24,13 @@ from .crawl import (
24
24
  from .kg import (
25
25
  dql as _dql,
26
26
  dql_async as _dql_async,
27
+ dql_fetch_ontology as _dql_fetch_ontology,
28
+ dql_fetch_ontology_async as _dql_fetch_ontology_async,
27
29
  dql_parallel as _dql_parallel,
30
+ dql_parallel_async as _dql_parallel_async,
28
31
  dql_refresh_ontology as _dql_refresh_ontology,
29
32
  )
33
+ from .ontology import Ontology
30
34
  from .web_search import (
31
35
  WEB_SEARCH_BASE,
32
36
  web_search as _web_search,
@@ -48,8 +52,8 @@ class Diffbot:
48
52
  """Client for the Diffbot APIs.
49
53
 
50
54
  Example:
51
- >>> from diffbot import Diffbot
52
- >>> db = Diffbot(token=os.getenv("DIFFBOT_API_TOKEN"))
55
+ >>> from diffbot import Diffbot, resolve_token
56
+ >>> db = Diffbot(token=resolve_token()) # env var or ~/.diffbot/credentials
53
57
  >>> db.extract("https://example.com")
54
58
  """
55
59
 
@@ -155,6 +159,10 @@ class Diffbot:
155
159
  """Download the Diffbot Knowledge Graph ontology and write it to dest."""
156
160
  _dql_refresh_ontology(self, dest)
157
161
 
162
def dql_fetch_ontology(self) -> Ontology:
    """Fetch the ontology and hand it back as an ``Ontology``.

    Delegates to the ``kg`` module helper; nothing is cached by this call.
    """
    ontology = _dql_fetch_ontology(self)
    return ontology
165
+
158
166
  def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
159
167
  """Search the web via the Diffbot LLM web search API."""
160
168
  return _web_search(self, text, num_results=num_results, max_tokens=max_tokens)
@@ -272,6 +280,14 @@ class DiffbotAsync:
272
280
  """
273
281
  return await _dql_async(self, query, size=size, from_=from_, format=format, filter=filter, exportspec=exportspec, extra=extra, raw=raw)
274
282
 
283
async def dql_parallel(self, queries: Sequence[Dict[str, Any]], *, workers: int = 8) -> List[Union[Dict[str, Any], bytes]]:
    """Run several DQL queries concurrently.

    Each item of ``queries`` is a dict of ``dql()`` keyword arguments;
    ``workers`` is forwarded to the ``kg`` module helper.
    """
    results = await _dql_parallel_async(self, queries, workers=workers)
    return results
286
+
287
async def dql_fetch_ontology(self) -> Ontology:
    """Fetch the ontology and hand it back as an ``Ontology``.

    Delegates to the async ``kg`` module helper; nothing is cached by this call.
    """
    ontology = await _dql_fetch_ontology_async(self)
    return ontology
290
+
275
291
  async def web_search(self, text: str, *, num_results: Optional[int] = None, max_tokens: Optional[int] = None) -> Dict[str, Any]:
276
292
  """Search the web via the Diffbot LLM web search API."""
277
293
  return await _web_search_async(self, text, num_results=num_results, max_tokens=max_tokens)
@@ -1,9 +1,12 @@
1
1
  """Diffbot Knowledge Graph APIs: DQL search and entity enhancement."""
2
2
 
3
+ import asyncio
3
4
  import pathlib
4
5
  from concurrent.futures import ThreadPoolExecutor
5
6
  from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union
6
7
 
8
+ from .ontology import Ontology
9
+
7
10
  if TYPE_CHECKING:
8
11
  from .client import Diffbot, DiffbotAsync
9
12
 
@@ -83,8 +86,43 @@ def dql_parallel(
83
86
  return list(ex.map(lambda q: dql(client, **q), queries))
84
87
 
85
88
 
89
async def dql_parallel_async(
    client: "DiffbotAsync",
    queries: Sequence[Dict[str, Any]],
    *,
    workers: int = 8,
) -> List[Union[Dict[str, Any], bytes]]:
    """Run several DQL queries concurrently and return one result per query.

    Args:
        client: Async client forwarded to ``dql_async`` for every query.
        queries: Keyword-argument dicts, one per ``dql_async`` call.
        workers: Maximum number of queries allowed in flight at once.

    Returns:
        The results gathered in the same order as ``queries``; an empty list
        when ``queries`` is empty.

    Raises:
        ValueError: if ``workers`` is less than 1 and there is work to do.
    """
    if not queries:
        return []
    if workers < 1:
        # Semaphore(0) would never admit a query, so every task would wait
        # forever; fail fast instead of hanging the caller.
        raise ValueError("workers must be at least 1")
    limiter = asyncio.Semaphore(workers)

    async def _one(q: Dict[str, Any]) -> Union[Dict[str, Any], bytes]:
        # The semaphore caps how many requests run concurrently.
        async with limiter:
            return await dql_async(client, **q)

    return await asyncio.gather(*(_one(q) for q in queries))
104
+
105
+
86
106
  def dql_refresh_ontology(client: "Diffbot", dest: pathlib.Path) -> None:
87
107
  response = client._http.get(KG_ONTOLOGY_ENDPOINT)
88
108
  client._raise_for_status(response)
89
109
  dest.parent.mkdir(parents=True, exist_ok=True)
90
110
  dest.write_bytes(response.content)
111
+
112
+
113
def dql_fetch_ontology(client: "Diffbot") -> Ontology:
    """Fetch the ontology over HTTP and wrap it in an :class:`Ontology`.

    Nothing is cached or written to disk here; holding on to the result is the
    caller's job. :func:`dql_refresh_ontology` is the variant that persists the
    raw bytes to a file.
    """
    resp = client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
122
+
123
+
124
async def dql_fetch_ontology_async(client: "DiffbotAsync") -> Ontology:
    """Fetch the ontology over HTTP (awaiting the request) and wrap it in an :class:`Ontology`."""
    resp = await client._http.get(KG_ONTOLOGY_ENDPOINT)
    client._raise_for_status(resp)
    body = resp.content
    return Ontology.from_json(body)
@@ -0,0 +1,160 @@
1
+ """In-memory navigation of the Diffbot Knowledge Graph ontology.
2
+
3
+ The ontology is a JSON document describing the Knowledge Graph's entity types,
4
+ composite types, enums, and taxonomies. An agent constructing DQL needs it to
5
+ look up real field paths and taxonomy values instead of guessing them.
6
+
7
+ This module is pure and storage-agnostic: build an :class:`Ontology` from
8
+ already-parsed data (or from raw JSON / a file path) and query it. How the
9
+ ontology document is fetched, and whether or where it is cached, is left
10
+ entirely to the caller — the `db` CLI caches it on disk at
11
+ ``~/.diffbot/ontology.json``; an in-process consumer (e.g. langchain) can cache
12
+ the :class:`Ontology` in memory. Fetch a fresh one over HTTP with
13
+ :meth:`diffbot.Diffbot.dql_fetch_ontology`.
14
+ """
15
+
16
+ import json
17
+ import pathlib
18
+ import re
19
+ from typing import Any, Dict, List, Optional, Tuple, Union
20
+
21
+
22
class Ontology:
    """Queryable view over a parsed Diffbot ontology document.

    The parsed document lives on :attr:`data`; the lookup methods read its
    ``types``, ``composites``, ``enums`` and ``taxonomies`` sections. Build an
    instance from already parsed data, or use :meth:`from_json` /
    :meth:`from_path` as conveniences.
    """

    def __init__(self, data: Dict[str, Any]):
        # Parsed ontology document; every lookup method reads from this dict.
        self.data = data

    @classmethod
    def from_json(cls, raw: Union[str, bytes]) -> "Ontology":
        """Build from a raw JSON string or bytes (e.g. an HTTP response body)."""
        return cls(json.loads(raw))

    @classmethod
    def from_path(cls, path: Union[str, pathlib.Path]) -> "Ontology":
        """Build from a JSON file on disk.

        The file is parsed from its raw bytes so that ``json.loads`` detects
        the JSON Unicode encoding itself; decoding with the platform's locale
        encoding could garble non-ASCII names.
        """
        return cls(json.loads(pathlib.Path(path).read_bytes()))

    def types(self) -> List[str]:
        """Return the sorted entity type names (e.g. ``Organization``, ``Person``)."""
        return sorted(self.data.get("types", {}).keys())

    def composites(self) -> List[str]:
        """Return the sorted composite type names (e.g. ``Location``, ``Employment``)."""
        return sorted(self.data.get("composites", {}).keys())

    def enums(self) -> List[str]:
        """Return the sorted enum type names (e.g. ``Language``, ``Gender``)."""
        return sorted(self.data.get("enums", {}).keys())

    def taxonomies(self) -> List[str]:
        """Return the sorted taxonomy names (e.g. ``OrganizationCategory``)."""
        return sorted(self.data.get("taxonomies", {}).keys())

    @staticmethod
    def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
        """Return the ``fields`` map of ``container[type_name]`` (``{}`` if absent).

        Raises:
            KeyError: if ``type_name`` is not a key of ``container``.
        """
        entry = container.get(type_name)
        if entry is None:
            raise KeyError(f"Unknown name: {type_name}")
        return entry.get("fields", {})

    def fields_for(self, type_name: str) -> Dict[str, Any]:
        """Return the field map of an entity type or composite.

        ``type_name`` is looked up among the entity types first, then among
        the composites.

        Raises:
            KeyError: if ``type_name`` is neither an entity type nor a composite.
        """
        types = self.data.get("types", {})
        composites = self.data.get("composites", {})
        if type_name in types:
            return self._fields_of(types, type_name)
        if type_name in composites:
            return self._fields_of(composites, type_name)
        raise KeyError(f"{type_name} is not a known entity type or composite")

    @staticmethod
    def filter_fields(
        fields: Dict[str, Any],
        search: Optional[str],
        include_deprecated: bool = False,
    ) -> List[Tuple[str, Dict[str, Any]]]:
        """Filter a field map by a case-insensitive name regex.

        Args:
            fields: Mapping of field name to field metadata.
            search: Regex matched against field names; falsy keeps every name.
            include_deprecated: Keep fields whose metadata sets ``isDeprecated``.

        Returns:
            ``(name, meta)`` pairs in the iteration order of ``fields``.
        """
        pattern = re.compile(search, re.IGNORECASE) if search else None
        out = []
        for name, meta in fields.items():
            if not include_deprecated and meta.get("isDeprecated"):
                continue
            if pattern and not pattern.search(name):
                continue
            out.append((name, meta))
        return out

    def taxonomy_values(self, name: str, search: Optional[str] = None) -> List[str]:
        """Flatten a taxonomy's category names, parents before their children.

        Args:
            name: Taxonomy to read.
            search: Optional case-insensitive regex; only matching names are kept.

        Raises:
            KeyError: if ``name`` is not a known taxonomy.
        """
        tax = self.data.get("taxonomies", {}).get(name)
        if tax is None:
            raise KeyError(f"Unknown taxonomy: {name}")
        pattern = re.compile(search, re.IGNORECASE) if search else None
        out: List[str] = []

        def walk(node: Dict[str, Any]) -> None:
            n = node.get("name")
            if n and (pattern is None or pattern.search(n)):
                out.append(n)
            # ``or []`` tolerates an explicit null ``children`` value.
            for child in node.get("children", []) or []:
                walk(child)

        for cat in tax.get("categories", []) or []:
            walk(cat)
        return out

    def enum_values(self, name: str) -> List[str]:
        """Return a copy of the allowed values of an enum.

        Raises:
            KeyError: if ``name`` is not a known enum.
        """
        enum = self.data.get("enums", {}).get(name)
        if enum is None:
            raise KeyError(f"Unknown enum: {name}")
        return list(enum.get("values", []))

    def find_named(self, search: str) -> List[str]:
        """Fallback search: every string ``name`` in the document matching a regex.

        The whole document is walked recursively; matches are de-duplicated
        and returned sorted. Matching is case-insensitive.
        """
        pattern = re.compile(search, re.IGNORECASE)
        found = set()

        def walk(node: Any) -> None:
            if isinstance(node, dict):
                n = node.get("name")
                # A field that is itself called "name" maps to a dict, so
                # only string values count as names.
                if isinstance(n, str) and pattern.search(n):
                    found.add(n)
                for v in node.values():
                    walk(v)
            elif isinstance(node, list):
                for v in node:
                    walk(v)

        walk(self.data)
        return sorted(found)

    @staticmethod
    def format_field(name: str, meta: Dict[str, Any]) -> str:
        """Render one field as ``<name>: [<type>] [flags...]`` for display.

        A ``LinkedEntity`` type is suffixed with its first ``leType`` entry
        when one is present. A missing ``type`` renders as ``?``.
        """
        t = meta.get("type", "?")
        if t == "LinkedEntity":
            le = meta.get("leType") or []
            if le:
                t = f"LinkedEntity ({le[0]})"
        flags = []
        if meta.get("isList"):
            flags.append("isList")
        if meta.get("isComposite"):
            flags.append("isComposite")
        if meta.get("isEnum"):
            flags.append("isEnum")
        if meta.get("isDeprecated"):
            flags.append("DEPRECATED")
        suffix = "".join(f" [{f}]" for f in flags)
        return f"{name}: [{t}]{suffix}"
@@ -18,7 +18,7 @@ def web_search(
18
18
  headers = {"Authorization": f"Bearer {client.token}"}
19
19
  params: Dict[str, Any] = {"text": text}
20
20
  if num_results is not None:
21
- params["num_results"] = num_results
21
+ params["size"] = num_results
22
22
  if max_tokens is not None:
23
23
  params["maxTokens"] = max_tokens
24
24
  response = client._http.get(client.web_search_url, headers=headers, params=params)
@@ -36,7 +36,7 @@ async def web_search_async(
36
36
  headers = {"Authorization": f"Bearer {client.token}"}
37
37
  params: Dict[str, Any] = {"text": text}
38
38
  if num_results is not None:
39
- params["num_results"] = num_results
39
+ params["size"] = num_results
40
40
  if max_tokens is not None:
41
41
  params["maxTokens"] = max_tokens
42
42
  response = await client._http.get(client.web_search_url, headers=headers, params=params)
@@ -1,15 +1,13 @@
1
- import os
2
-
3
1
  import pytest
4
2
 
5
- from diffbot import Diffbot
3
+ from diffbot import Diffbot, resolve_token
6
4
 
7
5
 
8
6
  @pytest.fixture(scope="session")
9
7
  def live_token():
10
- token = os.environ.get("DIFFBOT_TOKEN")
8
+ token = resolve_token()
11
9
  if not token:
12
- pytest.skip("DIFFBOT_TOKEN not set")
10
+ pytest.skip("no Diffbot token found (set DIFFBOT_API_TOKEN or ~/.diffbot/credentials)")
13
11
  return token
14
12
 
15
13
 
@@ -1,7 +1,15 @@
1
1
  import httpx
2
2
  import pytest
3
3
 
4
- from diffbot import APIError, AuthError, Diffbot, ExtractionError, RateLimitError, ValidationError
4
+ from diffbot import (
5
+ APIError,
6
+ AuthError,
7
+ Diffbot,
8
+ ExtractionError,
9
+ RateLimitError,
10
+ ValidationError,
11
+ resolve_token,
12
+ )
5
13
 
6
14
 
7
15
  """
@@ -40,6 +48,36 @@ def test_token_required():
40
48
  Diffbot(token="")
41
49
 
42
50
 
51
+ def test_resolve_token_explicit_wins(monkeypatch, tmp_path):
52
+ # An explicit token takes precedence over env var and file.
53
+ monkeypatch.setenv("DIFFBOT_API_TOKEN", "env-token")
54
+ assert resolve_token("explicit-token") == "explicit-token"
55
+
56
+
57
+ def test_resolve_token_from_env(monkeypatch, tmp_path):
58
+ # A token in the environment is returned when none is passed.
59
+ monkeypatch.setenv("DIFFBOT_API_TOKEN", "env-token")
60
+ monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
61
+ assert resolve_token() == "env-token"
62
+ # And can be used to build a client.
63
+ assert Diffbot(token=resolve_token()).token == "env-token"
64
+
65
+
66
+ def test_resolve_token_from_credentials_file(monkeypatch, tmp_path):
67
+ # Falls back to ~/.diffbot/credentials when no env var is set.
68
+ monkeypatch.delenv("DIFFBOT_API_TOKEN", raising=False)
69
+ creds = tmp_path / "credentials"
70
+ creds.write_text("DIFFBOT_API_TOKEN=file-token\n")
71
+ monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", creds)
72
+ assert resolve_token() == "file-token"
73
+
74
+
75
+ def test_resolve_token_missing_returns_empty(monkeypatch, tmp_path):
76
+ monkeypatch.delenv("DIFFBOT_API_TOKEN", raising=False)
77
+ monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
78
+ assert resolve_token() == ""
79
+
80
+
43
81
  def test_user_agent_header():
44
82
  captured = {}
45
83
 
@@ -0,0 +1,119 @@
1
+ import json
2
+
3
+ import httpx
4
+ import pytest
5
+
6
+ from diffbot import Diffbot, DiffbotAsync, Ontology
7
+
8
+ FIXTURE_ONTOLOGY = {
9
+ "types": {
10
+ "Organization": {
11
+ "fields": {
12
+ "name": {"type": "String"},
13
+ "location": {"type": "Location", "isComposite": True},
14
+ "oldField": {"type": "String", "isDeprecated": True},
15
+ }
16
+ },
17
+ "Person": {"fields": {"name": {"type": "String"}}},
18
+ },
19
+ "composites": {
20
+ "Location": {"fields": {"city": {"type": "City", "isComposite": True}}},
21
+ },
22
+ "enums": {"Language": {"values": ["EN", "FR", "DE"]}},
23
+ "taxonomies": {
24
+ "OrganizationCategory": {
25
+ "categories": [
26
+ {"name": "Technology", "children": [{"name": "Semiconductor Companies"}]},
27
+ ]
28
+ }
29
+ },
30
+ }
31
+
32
+
33
+ @pytest.fixture
34
+ def ont() -> Ontology:
35
+ return Ontology(FIXTURE_ONTOLOGY)
36
+
37
+
38
+ def test_navigation_helpers(ont):
39
+ assert ont.types() == ["Organization", "Person"]
40
+ assert ont.composites() == ["Location"]
41
+ assert ont.enums() == ["Language"]
42
+ assert ont.taxonomies() == ["OrganizationCategory"]
43
+ assert ont.enum_values("Language") == ["EN", "FR", "DE"]
44
+ assert ont.taxonomy_values("OrganizationCategory", "semi") == ["Semiconductor Companies"]
45
+ assert ont.find_named("compan") == ["Semiconductor Companies"]
46
+
47
+
48
+ def test_fields_for_routes_types_and_composites(ont):
49
+ assert "name" in ont.fields_for("Organization")
50
+ assert "city" in ont.fields_for("Location")
51
+ with pytest.raises(KeyError):
52
+ ont.fields_for("NopeType")
53
+
54
+
55
+ def test_filter_fields_drops_deprecated_by_default(ont):
56
+ fields = ont.fields_for("Organization")
57
+ names = [n for n, _ in Ontology.filter_fields(fields, None)]
58
+ assert "oldField" not in names
59
+ names_incl = [n for n, _ in Ontology.filter_fields(fields, None, include_deprecated=True)]
60
+ assert "oldField" in names_incl
61
+
62
+
63
+ def test_format_field(ont):
64
+ fields = ont.fields_for("Organization")
65
+ assert Ontology.format_field("location", fields["location"]) == "location: [Location] [isComposite]"
66
+
67
+
68
+ def test_from_json_and_from_path(tmp_path):
69
+ raw = json.dumps(FIXTURE_ONTOLOGY)
70
+ assert Ontology.from_json(raw).types() == ["Organization", "Person"]
71
+ path = tmp_path / "ontology.json"
72
+ path.write_text(raw)
73
+ assert Ontology.from_path(path).enums() == ["Language"]
74
+
75
+
76
+ def test_unknown_taxonomy_and_enum_raise(ont):
77
+ with pytest.raises(KeyError):
78
+ ont.taxonomy_values("Nope")
79
+ with pytest.raises(KeyError):
80
+ ont.enum_values("Nope")
81
+
82
+
83
+ def test_dql_fetch_ontology_returns_ontology():
84
+ def handler(request: httpx.Request) -> httpx.Response:
85
+ assert request.url.path.endswith("/ontology")
86
+ return httpx.Response(200, json=FIXTURE_ONTOLOGY)
87
+
88
+ db = Diffbot(token="test-token", transport=httpx.MockTransport(handler))
89
+ ont = db.dql_fetch_ontology()
90
+ assert isinstance(ont, Ontology)
91
+ assert ont.types() == ["Organization", "Person"]
92
+
93
+
94
+ @pytest.mark.anyio
95
+ async def test_async_dql_fetch_ontology_returns_ontology():
96
+ def handler(request: httpx.Request) -> httpx.Response:
97
+ return httpx.Response(200, json=FIXTURE_ONTOLOGY)
98
+
99
+ db = DiffbotAsync(token="test-token", transport=httpx.MockTransport(handler))
100
+ ont = await db.dql_fetch_ontology()
101
+ assert isinstance(ont, Ontology)
102
+ assert ont.composites() == ["Location"]
103
+
104
+
105
+ @pytest.mark.anyio
106
+ async def test_async_dql_parallel_runs_all_queries():
107
+ def handler(request: httpx.Request) -> httpx.Response:
108
+ q = request.url.params["query"]
109
+ hits = 5 if "Diffbot" in q else 100
110
+ return httpx.Response(200, json={"hits": hits, "results": 0})
111
+
112
+ db = DiffbotAsync(token="test-token", transport=httpx.MockTransport(handler))
113
+ results = await db.dql_parallel(
114
+ [
115
+ {"query": 'type:Organization name:"Diffbot"', "size": 0},
116
+ {"query": "type:Organization", "size": 0},
117
+ ]
118
+ )
119
+ assert [r["hits"] for r in results] == [5, 100]
@@ -2,7 +2,7 @@
2
2
  import httpx
3
3
  import pytest
4
4
 
5
- from diffbot import CrawlEventType, Diffbot, DiffbotAsync
5
+ from diffbot import CrawlEventType, Diffbot, DiffbotAsync, resolve_token
6
6
 
7
7
  SSE_PARIS = 'data: {"choices": [{"delta": {"content": "Paris"}}]}\n'
8
8
 
@@ -173,3 +173,21 @@ async def test_readme_async_entities():
173
173
  assert len(result["entities"]) == 2
174
174
  assert result["entities"][0]["name"] == "Apple"
175
175
  assert result["sentiment"] == 0.3
176
+
177
+
178
+ # ---------------------------------------------------------------------------
179
+ # Authentication
180
+ # ---------------------------------------------------------------------------
181
+
182
+ def test_readme_authentication_resolve_token(monkeypatch, tmp_path):
183
+ # README "Authentication": Diffbot(token=resolve_token()) using the env var.
184
+ monkeypatch.setenv("DIFFBOT_API_TOKEN", "test-token")
185
+ monkeypatch.setattr("diffbot._auth.CREDENTIALS_PATH", tmp_path / "missing")
186
+
187
+ def handler(request: httpx.Request) -> httpx.Response:
188
+ assert request.url.params["token"] == "test-token"
189
+ return httpx.Response(200, json={"objects": [{"title": "Example"}]})
190
+
191
+ db = Diffbot(token=resolve_token(), transport=httpx.MockTransport(handler))
192
+ data = db.extract("https://www.example.com")
193
+ assert "objects" in data
@@ -1,5 +0,0 @@
1
- # Agent Guidelines
2
-
3
- ## README Examples
4
-
5
- Whenever a code example in `README.md` is added or updated, the corresponding test must be added or updated in `tests/test_readme_examples.py`. Run `python -m pytest tests/test_readme_examples.py` to validate before considering the work complete.
@@ -1 +0,0 @@
1
- AGENTS.md
@@ -1,36 +0,0 @@
1
- import os
2
- import pathlib
3
-
4
- import click
5
-
6
- from diffbot import Diffbot
7
-
8
- CREDENTIALS_PATH = pathlib.Path.home() / ".diffbot" / "credentials"
9
-
10
-
11
- def resolve_token() -> str:
12
- """Return the Diffbot API token from the env var, falling back to ~/.diffbot/credentials."""
13
- token = os.environ.get("DIFFBOT_API_TOKEN", "").strip()
14
- if token:
15
- return token
16
-
17
- if CREDENTIALS_PATH.exists():
18
- for line in CREDENTIALS_PATH.read_text().splitlines():
19
- line = line.strip()
20
- if line.startswith("DIFFBOT_API_TOKEN="):
21
- return line[len("DIFFBOT_API_TOKEN="):].strip()
22
-
23
- return ""
24
-
25
-
26
- def get_client() -> Diffbot:
27
- token = resolve_token()
28
- if not token:
29
- click.echo(
30
- "Error: no Diffbot API token found.\n"
31
- " Set a DIFFBOT_API_TOKEN environment variable, or\n"
32
- f" write 'DIFFBOT_API_TOKEN=YOUR_TOKEN' to {CREDENTIALS_PATH}",
33
- err=True,
34
- )
35
- raise click.Abort()
36
- return Diffbot(token=token)
@@ -1,130 +0,0 @@
1
- import json
2
- import pathlib
3
- import re
4
- from typing import Any, Dict, List, Optional
5
-
6
- ONTOLOGY_PATH = pathlib.Path.home() / ".diffbot" / "ontology.json"
7
-
8
- _CACHE: Dict[str, Any] = {}
9
-
10
-
11
- def load() -> Dict[str, Any]:
12
- if "data" not in _CACHE:
13
- if not ONTOLOGY_PATH.exists():
14
- raise FileNotFoundError(
15
- f"Ontology not found at {ONTOLOGY_PATH}. Run: db dql init"
16
- )
17
- _CACHE["data"] = json.loads(ONTOLOGY_PATH.read_text())
18
- return _CACHE["data"]
19
-
20
-
21
- def list_types() -> List[str]:
22
- return sorted(load().get("types", {}).keys())
23
-
24
-
25
- def list_composites() -> List[str]:
26
- return sorted(load().get("composites", {}).keys())
27
-
28
-
29
- def list_enums() -> List[str]:
30
- return sorted(load().get("enums", {}).keys())
31
-
32
-
33
- def list_taxonomies() -> List[str]:
34
- return sorted(load().get("taxonomies", {}).keys())
35
-
36
-
37
- def _fields_of(container: Dict[str, Any], type_name: str) -> Dict[str, Any]:
38
- entry = container.get(type_name)
39
- if entry is None:
40
- raise KeyError(f"Unknown name: {type_name}")
41
- return entry.get("fields", {})
42
-
43
-
44
- def fields_for(type_name: str) -> Dict[str, Any]:
45
- data = load()
46
- types = data.get("types", {})
47
- composites = data.get("composites", {})
48
- if type_name in types:
49
- return _fields_of(types, type_name)
50
- if type_name in composites:
51
- return _fields_of(composites, type_name)
52
- raise KeyError(f"{type_name} is not a known entity type or composite")
53
-
54
-
55
- def format_field(name: str, meta: Dict[str, Any]) -> str:
56
- t = meta.get("type", "?")
57
- if t == "LinkedEntity":
58
- le = meta.get("leType") or []
59
- if le:
60
- t = f"LinkedEntity ({le[0]})"
61
- flags = []
62
- if meta.get("isList"):
63
- flags.append("isList")
64
- if meta.get("isComposite"):
65
- flags.append("isComposite")
66
- if meta.get("isEnum"):
67
- flags.append("isEnum")
68
- if meta.get("isDeprecated"):
69
- flags.append("DEPRECATED")
70
- suffix = "".join(f" [{f}]" for f in flags)
71
- return f"{name}: [{t}]{suffix}"
72
-
73
-
74
- def filter_fields(fields: Dict[str, Any], search: Optional[str], include_deprecated: bool = False) -> List[tuple]:
75
- pattern = re.compile(search, re.IGNORECASE) if search else None
76
- out = []
77
- for name, meta in fields.items():
78
- if not include_deprecated and meta.get("isDeprecated"):
79
- continue
80
- if pattern and not pattern.search(name):
81
- continue
82
- out.append((name, meta))
83
- return out
84
-
85
-
86
- def taxonomy_values(name: str, search: Optional[str] = None) -> List[str]:
87
- data = load()
88
- tax = data.get("taxonomies", {}).get(name)
89
- if tax is None:
90
- raise KeyError(f"Unknown taxonomy: {name}")
91
- pattern = re.compile(search, re.IGNORECASE) if search else None
92
- out: List[str] = []
93
-
94
- def walk(node: Dict[str, Any]) -> None:
95
- n = node.get("name")
96
- if n and (pattern is None or pattern.search(n)):
97
- out.append(n)
98
- for child in node.get("children", []) or []:
99
- walk(child)
100
-
101
- for cat in tax.get("categories", []) or []:
102
- walk(cat)
103
- return out
104
-
105
-
106
- def enum_values(name: str) -> List[str]:
107
- data = load()
108
- enum = data.get("enums", {}).get(name)
109
- if enum is None:
110
- raise KeyError(f"Unknown enum: {name}")
111
- return list(enum.get("values", []))
112
-
113
-
114
- def find_named(search: str) -> List[str]:
115
- pattern = re.compile(search, re.IGNORECASE)
116
- found = set()
117
-
118
- def walk(node: Any) -> None:
119
- if isinstance(node, dict):
120
- n = node.get("name")
121
- if isinstance(n, str) and pattern.search(n):
122
- found.add(n)
123
- for v in node.values():
124
- walk(v)
125
- elif isinstance(node, list):
126
- for v in node:
127
- walk(v)
128
-
129
- walk(load())
130
- return sorted(found)
File without changes