diffbot-python 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. diffbot_python-0.1.0/.gitignore +8 -0
  2. diffbot_python-0.1.0/AGENTS.md +5 -0
  3. diffbot_python-0.1.0/CLAUDE.md +1 -0
  4. diffbot_python-0.1.0/LICENSE +21 -0
  5. diffbot_python-0.1.0/PKG-INFO +218 -0
  6. diffbot_python-0.1.0/README.md +195 -0
  7. diffbot_python-0.1.0/pyproject.toml +48 -0
  8. diffbot_python-0.1.0/src/diffbot/__init__.py +29 -0
  9. diffbot_python-0.1.0/src/diffbot/ask.py +48 -0
  10. diffbot_python-0.1.0/src/diffbot/cli/__init__.py +399 -0
  11. diffbot_python-0.1.0/src/diffbot/cli/__main__.py +4 -0
  12. diffbot_python-0.1.0/src/diffbot/cli/_common.py +36 -0
  13. diffbot_python-0.1.0/src/diffbot/cli/dql.py +308 -0
  14. diffbot_python-0.1.0/src/diffbot/cli/entities.py +155 -0
  15. diffbot_python-0.1.0/src/diffbot/cli/ontology.py +130 -0
  16. diffbot_python-0.1.0/src/diffbot/client.py +285 -0
  17. diffbot_python-0.1.0/src/diffbot/crawl.py +270 -0
  18. diffbot_python-0.1.0/src/diffbot/errors.py +51 -0
  19. diffbot_python-0.1.0/src/diffbot/extract.py +45 -0
  20. diffbot_python-0.1.0/src/diffbot/kg.py +90 -0
  21. diffbot_python-0.1.0/src/diffbot/nlp.py +37 -0
  22. diffbot_python-0.1.0/src/diffbot/web_search.py +44 -0
  23. diffbot_python-0.1.0/tests/conftest.py +18 -0
  24. diffbot_python-0.1.0/tests/test_ask.py +12 -0
  25. diffbot_python-0.1.0/tests/test_async.py +94 -0
  26. diffbot_python-0.1.0/tests/test_crawl.py +63 -0
  27. diffbot_python-0.1.0/tests/test_dql.py +14 -0
  28. diffbot_python-0.1.0/tests/test_dql_cli.py +118 -0
  29. diffbot_python-0.1.0/tests/test_extract.py +132 -0
  30. diffbot_python-0.1.0/tests/test_readme_examples.py +175 -0
  31. diffbot_python-0.1.0/tests/test_web_search.py +12 -0
@@ -0,0 +1,8 @@
1
+ # Python
2
+ .venv
3
+ __pycache__
4
+ .pytest_cache
5
+ .env
6
+
7
+ # Claude
8
+ .claude/settings.local.json
@@ -0,0 +1,5 @@
1
+ # Agent Guidelines
2
+
3
+ ## README Examples
4
+
5
+ Whenever a code example in `README.md` is added or updated, the corresponding test must be added or updated in `tests/test_readme_examples.py`. Run `python -m pytest tests/test_readme_examples.py` to validate before considering the work complete.
@@ -0,0 +1 @@
1
+ AGENTS.md
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Diffbot
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,218 @@
1
+ Metadata-Version: 2.4
2
+ Name: diffbot-python
3
+ Version: 0.1.0
4
+ Summary: Python client library for Diffbot APIs
5
+ Project-URL: Homepage, https://github.com/diffbot/diffbot-python
6
+ Project-URL: Repository, https://github.com/diffbot/diffbot-python
7
+ Project-URL: Issues, https://github.com/diffbot/diffbot-python/issues
8
+ Author-email: Jerome Choo <jerome@diffbot.com>, Mike Tung <miket@diffbot.com>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: click>=8.1.0
18
+ Requires-Dist: httpx>=0.27.0
19
+ Requires-Dist: rich>=13.0.0
20
+ Provides-Extra: dev
21
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
22
+ Description-Content-Type: text/markdown
23
+
24
+ # Diffbot Python Library
25
+
26
+ Python client library for [Diffbot](https://www.diffbot.com) APIs.
27
+
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install git+https://github.com/diffbot/diffbot-python.git
33
+ ```
34
+
35
+ Or, for local development:
36
+
37
+ ```bash
38
+ pip install -e ".[dev]"
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ ### Authentication
44
+ Set your Diffbot API token in your environment or .env.
45
+
46
+ ```bash
47
+ export DIFFBOT_API_TOKEN=<TOKEN>
48
+ ```
49
+
50
+ ### Extract structured content
51
+ ```python
52
+ from diffbot import Diffbot
53
+
54
+ db = Diffbot(token="YOUR_TOKEN")
55
+ data = db.extract("https://www.example.com")
56
+ ```
57
+
58
+ ### Ask Diffbot LLM
59
+ ```python
60
+ from diffbot import Diffbot
61
+
62
+ db = Diffbot(token="YOUR_TOKEN")
63
+ for chunk in db.ask([{"role": "user", "content": "What's the capital of France?"}]):
64
+ print(chunk, end="")
65
+ ```
66
+
67
+ ### Crawl a site for structured content
68
+ ```python
69
+ from diffbot import Diffbot
70
+
71
+ db = Diffbot(token="YOUR_TOKEN")
72
+ for event in db.crawl("https://www.example.com", hops=1):
73
+ print(event)
74
+ ```
75
+
76
+ ### Query the Knowledge Graph
77
+ ```python
78
+ from diffbot import Diffbot
79
+
80
+ db = Diffbot(token="YOUR_TOKEN")
81
+ results = db.dql('type:Organization name:"Diffbot"')
82
+ ```
83
+
84
+ ### Web Search
85
+ ```python
86
+ from diffbot import Diffbot
87
+
88
+ db = Diffbot(token="YOUR_TOKEN")
89
+ results = db.web_search("diffbot knowledge graph")
90
+ for r in results["search_results"]:
91
+ print(r["score"], r["title"], r["pageUrl"])
92
+ print(r["content"])
93
+ ```
94
+
95
+ ### Entities (NLP)
96
+ ```python
97
+ from diffbot import Diffbot
98
+
99
+ db = Diffbot(token="YOUR_TOKEN")
100
+ result = db.entities("Apple CEO Tim Cook announced record quarterly earnings.")
101
+ for entity in result["entities"]:
102
+ print(entity["name"], entity.get("type"), entity.get("id"))
103
+ print("sentiment:", result.get("sentiment"))
104
+ ```
105
+
106
+ ## Async Usage
107
+
108
+ ### Extract structured content
109
+ ```python
110
+ import asyncio
111
+ from diffbot import DiffbotAsync
112
+
113
+ async def main():
114
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
115
+ data = await db.extract("https://www.example.com")
116
+ print(data)
117
+
118
+ asyncio.run(main())
119
+ ```
120
+
121
+ ### Ask Diffbot LLM
122
+ ```python
123
+ import asyncio
124
+ from diffbot import DiffbotAsync
125
+
126
+ async def main():
127
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
128
+ async for chunk in db.ask([{"role": "user", "content": "What's the capital of France?"}]):
129
+ print(chunk, end="")
130
+
131
+ asyncio.run(main())
132
+ ```
133
+
134
+ ### Crawl a site for structured content
135
+ ```python
136
+ import asyncio
137
+ from diffbot import DiffbotAsync
138
+
139
+ async def main():
140
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
141
+ async for event in db.crawl("https://www.example.com", hops=1):
142
+ print(event)
143
+
144
+ asyncio.run(main())
145
+ ```
146
+
147
+ ### Query the Knowledge Graph
148
+ ```python
149
+ import asyncio
150
+ from diffbot import DiffbotAsync
151
+
152
+ async def main():
153
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
154
+ results = await db.dql('type:Organization name:"Diffbot"')
155
+ print(results)
156
+
157
+ asyncio.run(main())
158
+ ```
159
+
160
+ ### Web Search
161
+ ```python
162
+ import asyncio
163
+ from diffbot import DiffbotAsync
164
+
165
+ async def main():
166
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
167
+ results = await db.web_search("diffbot knowledge graph")
168
+ for r in results["search_results"]:
169
+ print(r["score"], r["title"], r["pageUrl"])
170
+ print(r["content"])
171
+
172
+ asyncio.run(main())
173
+ ```
174
+
175
+ ### Entities (NLP)
176
+ ```python
177
+ import asyncio
178
+ from diffbot import DiffbotAsync
179
+
180
+ async def main():
181
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
182
+ result = await db.entities("Apple CEO Tim Cook announced record quarterly earnings.")
183
+ for entity in result["entities"]:
184
+ print(entity["name"], entity.get("type"), entity.get("id"))
185
+ print("sentiment:", result.get("sentiment"))
186
+
187
+ asyncio.run(main())
188
+ ```
189
+
190
+ ## CLI
191
+
192
+ This library also includes a CLI.
193
+
194
+ ```bash
195
+ export DIFFBOT_API_TOKEN=your-token-here
196
+
197
+ db extract https://www.example.com
198
+ db ask "What's the capital of France?"
199
+ db crawl https://www.example.com --hops 1
200
+ db crawl-list-jobs
201
+ db crawl-delete-job crawl-1234567890
202
+ db web-search "diffbot knowledge graph"
203
+ db web-search "diffbot knowledge graph" -n 5 -f json
204
+ db entities "Apple CEO Tim Cook announced record quarterly earnings."
205
+ db entities "Apple CEO Tim Cook announced record quarterly earnings." -f dql
206
+ ```
207
+
208
+ ## Tests
209
+
210
+ Run the mock test suite:
211
+ ```bash
212
+ python -m pytest
213
+ ```
214
+
215
+ Run live integration tests against the real API (requires a valid token):
216
+ ```bash
217
+ DIFFBOT_TOKEN=your_token python -m pytest -m live
218
+ ```
@@ -0,0 +1,195 @@
1
+ # Diffbot Python Library
2
+
3
+ Python client library for [Diffbot](https://www.diffbot.com) APIs.
4
+
5
+
6
+ ## Installation
7
+
8
+ ```bash
9
+ pip install git+https://github.com/diffbot/diffbot-python.git
10
+ ```
11
+
12
+ Or, for local development:
13
+
14
+ ```bash
15
+ pip install -e ".[dev]"
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ### Authentication
21
+ Set your Diffbot API token in your environment or .env.
22
+
23
+ ```bash
24
+ export DIFFBOT_API_TOKEN=<TOKEN>
25
+ ```
26
+
27
+ ### Extract structured content
28
+ ```python
29
+ from diffbot import Diffbot
30
+
31
+ db = Diffbot(token="YOUR_TOKEN")
32
+ data = db.extract("https://www.example.com")
33
+ ```
34
+
35
+ ### Ask Diffbot LLM
36
+ ```python
37
+ from diffbot import Diffbot
38
+
39
+ db = Diffbot(token="YOUR_TOKEN")
40
+ for chunk in db.ask([{"role": "user", "content": "What's the capital of France?"}]):
41
+ print(chunk, end="")
42
+ ```
43
+
44
+ ### Crawl a site for structured content
45
+ ```python
46
+ from diffbot import Diffbot
47
+
48
+ db = Diffbot(token="YOUR_TOKEN")
49
+ for event in db.crawl("https://www.example.com", hops=1):
50
+ print(event)
51
+ ```
52
+
53
+ ### Query the Knowledge Graph
54
+ ```python
55
+ from diffbot import Diffbot
56
+
57
+ db = Diffbot(token="YOUR_TOKEN")
58
+ results = db.dql('type:Organization name:"Diffbot"')
59
+ ```
60
+
61
+ ### Web Search
62
+ ```python
63
+ from diffbot import Diffbot
64
+
65
+ db = Diffbot(token="YOUR_TOKEN")
66
+ results = db.web_search("diffbot knowledge graph")
67
+ for r in results["search_results"]:
68
+ print(r["score"], r["title"], r["pageUrl"])
69
+ print(r["content"])
70
+ ```
71
+
72
+ ### Entities (NLP)
73
+ ```python
74
+ from diffbot import Diffbot
75
+
76
+ db = Diffbot(token="YOUR_TOKEN")
77
+ result = db.entities("Apple CEO Tim Cook announced record quarterly earnings.")
78
+ for entity in result["entities"]:
79
+ print(entity["name"], entity.get("type"), entity.get("id"))
80
+ print("sentiment:", result.get("sentiment"))
81
+ ```
82
+
83
+ ## Async Usage
84
+
85
+ ### Extract structured content
86
+ ```python
87
+ import asyncio
88
+ from diffbot import DiffbotAsync
89
+
90
+ async def main():
91
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
92
+ data = await db.extract("https://www.example.com")
93
+ print(data)
94
+
95
+ asyncio.run(main())
96
+ ```
97
+
98
+ ### Ask Diffbot LLM
99
+ ```python
100
+ import asyncio
101
+ from diffbot import DiffbotAsync
102
+
103
+ async def main():
104
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
105
+ async for chunk in db.ask([{"role": "user", "content": "What's the capital of France?"}]):
106
+ print(chunk, end="")
107
+
108
+ asyncio.run(main())
109
+ ```
110
+
111
+ ### Crawl a site for structured content
112
+ ```python
113
+ import asyncio
114
+ from diffbot import DiffbotAsync
115
+
116
+ async def main():
117
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
118
+ async for event in db.crawl("https://www.example.com", hops=1):
119
+ print(event)
120
+
121
+ asyncio.run(main())
122
+ ```
123
+
124
+ ### Query the Knowledge Graph
125
+ ```python
126
+ import asyncio
127
+ from diffbot import DiffbotAsync
128
+
129
+ async def main():
130
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
131
+ results = await db.dql('type:Organization name:"Diffbot"')
132
+ print(results)
133
+
134
+ asyncio.run(main())
135
+ ```
136
+
137
+ ### Web Search
138
+ ```python
139
+ import asyncio
140
+ from diffbot import DiffbotAsync
141
+
142
+ async def main():
143
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
144
+ results = await db.web_search("diffbot knowledge graph")
145
+ for r in results["search_results"]:
146
+ print(r["score"], r["title"], r["pageUrl"])
147
+ print(r["content"])
148
+
149
+ asyncio.run(main())
150
+ ```
151
+
152
+ ### Entities (NLP)
153
+ ```python
154
+ import asyncio
155
+ from diffbot import DiffbotAsync
156
+
157
+ async def main():
158
+ async with DiffbotAsync(token="YOUR_TOKEN") as db:
159
+ result = await db.entities("Apple CEO Tim Cook announced record quarterly earnings.")
160
+ for entity in result["entities"]:
161
+ print(entity["name"], entity.get("type"), entity.get("id"))
162
+ print("sentiment:", result.get("sentiment"))
163
+
164
+ asyncio.run(main())
165
+ ```
166
+
167
+ ## CLI
168
+
169
+ This library also includes a CLI.
170
+
171
+ ```bash
172
+ export DIFFBOT_API_TOKEN=your-token-here
173
+
174
+ db extract https://www.example.com
175
+ db ask "What's the capital of France?"
176
+ db crawl https://www.example.com --hops 1
177
+ db crawl-list-jobs
178
+ db crawl-delete-job crawl-1234567890
179
+ db web-search "diffbot knowledge graph"
180
+ db web-search "diffbot knowledge graph" -n 5 -f json
181
+ db entities "Apple CEO Tim Cook announced record quarterly earnings."
182
+ db entities "Apple CEO Tim Cook announced record quarterly earnings." -f dql
183
+ ```
184
+
185
+ ## Tests
186
+
187
+ Run the mock test suite:
188
+ ```bash
189
+ python -m pytest
190
+ ```
191
+
192
+ Run live integration tests against the real API (requires a valid token):
193
+ ```bash
194
+ DIFFBOT_TOKEN=your_token python -m pytest -m live
195
+ ```
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "diffbot-python"
7
+ version = "0.1.0"
8
+ description = "Python client library for Diffbot APIs"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "Operating System :: OS Independent",
14
+ "Topic :: Software Development :: Libraries",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search"
17
+ ]
18
+ license = "MIT"
19
+ license-files = ["LICEN[CS]E*"]
20
+ authors = [
21
+ { name = "Jerome Choo", email = "jerome@diffbot.com" },
22
+ { name = "Mike Tung", email = "miket@diffbot.com" }
23
+ ]
24
+ dependencies = [
25
+ "httpx>=0.27.0",
26
+ "click>=8.1.0",
27
+ "rich>=13.0.0",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ dev = [
32
+ "pytest>=8.0.0",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/diffbot/diffbot-python"
37
+ Repository = "https://github.com/diffbot/diffbot-python"
38
+ Issues = "https://github.com/diffbot/diffbot-python/issues"
39
+
40
+ [project.scripts]
41
+ db = "diffbot.cli:main"
42
+
43
+ [tool.hatch.build.targets.wheel]
44
+ packages = ["src/diffbot"]
45
+
46
+ [tool.pytest.ini_options]
47
+ markers = ["live: marks tests as live integration tests requiring a real DIFFBOT_TOKEN"]
48
+ addopts = "-m 'not live'"
@@ -0,0 +1,29 @@
1
+ """
2
+ diffbot - Python client library for the Diffbot APIs.
3
+ """
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ from .client import Diffbot, DiffbotAsync
8
+ from .crawl import CrawlEvent, CrawlEventType
9
+ from .errors import (
10
+ APIError,
11
+ AuthError,
12
+ DiffbotError,
13
+ ExtractionError,
14
+ RateLimitError,
15
+ ValidationError,
16
+ )
17
+
18
+ __all__ = [
19
+ "Diffbot",
20
+ "DiffbotAsync",
21
+ "CrawlEvent",
22
+ "CrawlEventType",
23
+ "DiffbotError",
24
+ "AuthError",
25
+ "ExtractionError",
26
+ "RateLimitError",
27
+ "APIError",
28
+ "ValidationError",
29
+ ]
@@ -0,0 +1,48 @@
1
+ """Diffbot LLM RAG API: stream a chat completion."""
2
+
3
+ import json
4
+ from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Iterator, List
5
+
6
+ if TYPE_CHECKING:
7
+ from .client import Diffbot, DiffbotAsync
8
+
9
+
10
+ def _build_payload(client: Any, messages: List[Dict[str, str]]) -> tuple:
11
+ headers = {"Authorization": f"Bearer {client.token}"}
12
+ payload = {"model": "diffbot-small-xl", "messages": messages, "stream": True}
13
+ return headers, payload
14
+
15
+
16
+ def _parse_chunk(line: str):
17
+ try:
18
+ chunk = json.loads(line.replace("data: ", ""))
19
+ except json.JSONDecodeError:
20
+ return None
21
+ choices = chunk.get("choices")
22
+ if choices and choices[0].get("delta", {}).get("content"):
23
+ return choices[0]["delta"]["content"]
24
+ return None
25
+
26
+
27
+ def ask(client: "Diffbot", messages: List[Dict[str, str]]) -> Iterator[str]:
28
+ headers = {"Authorization": f"Bearer {client.token}"}
29
+ payload = {"model": "diffbot-small-xl", "messages": messages, "stream": True}
30
+ with client._http.stream("POST", client.llm_url, headers=headers, json=payload) as response:
31
+ client._raise_for_status(response)
32
+ for line in response.iter_lines():
33
+ if line:
34
+ content = _parse_chunk(line)
35
+ if content:
36
+ yield content
37
+
38
+
39
+ async def ask_async(client: "DiffbotAsync", messages: List[Dict[str, str]]) -> AsyncIterator[str]:
40
+ headers = {"Authorization": f"Bearer {client.token}"}
41
+ payload = {"model": "diffbot-small-xl", "messages": messages, "stream": True}
42
+ async with client._http.stream("POST", client.llm_url, headers=headers, json=payload) as response:
43
+ client._raise_for_status(response)
44
+ async for line in response.aiter_lines():
45
+ if line:
46
+ content = _parse_chunk(line)
47
+ if content:
48
+ yield content