nomic 3.5.2.tar.gz → 3.8.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomic-3.8.0/MANIFEST.in +1 -0
- {nomic-3.5.2 → nomic-3.8.0}/PKG-INFO +1 -1
- nomic-3.8.0/nomic/__init__.py +10 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/cli.py +1 -1
- nomic-3.8.0/nomic/client.py +430 -0
- nomic-3.8.0/nomic/client_models.py +103 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/dataset.py +40 -1
- {nomic-3.5.2 → nomic-3.8.0}/nomic/embed.py +1 -1
- nomic-3.8.0/nomic/py.typed +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/PKG-INFO +1 -1
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/SOURCES.txt +4 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/requires.txt +2 -1
- {nomic-3.5.2 → nomic-3.8.0}/setup.py +4 -2
- nomic-3.5.2/nomic/__init__.py +0 -2
- {nomic-3.5.2 → nomic-3.8.0}/README.md +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/atlas.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/aws/__init__.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/aws/sagemaker.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/data_inference.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/data_operations.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/pl_callbacks/__init__.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/pl_callbacks/pl_callback.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/settings.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic/utils.py +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/dependency_links.txt +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/entry_points.txt +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/top_level.txt +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/pyproject.toml +0 -0
- {nomic-3.5.2 → nomic-3.8.0}/setup.cfg +0 -0
nomic-3.8.0/MANIFEST.in
ADDED

@@ -0,0 +1 @@
+include nomic/py.typed

{nomic-3.5.2 → nomic-3.8.0}/nomic/cli.py

@@ -53,7 +53,7 @@ def login(token, tenant="production", domain=None):
     console.print("Authenticate with the Nomic API", style=style, justify="center")
     console.print(auth0_auth_endpoint, style=style, justify="center")
     console.print(
-        "Click the above link to retrieve your access token and then run `nomic login [token]`",
+        "Click the above link to retrieve your access token and then run `nomic login \\[token]`",
         style=style,
         justify="center",
     )
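
For context on this one-line change: Rich's `console.print` treats square-bracketed text as console markup, so the literal placeholder `[token]` needs an escaped opening bracket to survive rendering. A minimal sketch of the behavior (not part of the package):

```python
from rich.console import Console

console = Console()

# Without the backslash, Rich would try to interpret "[token]" as a markup tag;
# escaping the opening bracket prints the placeholder verbatim.
console.print(r"Run `nomic login \[token]` to authenticate")
```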

nomic-3.8.0/nomic/client.py
ADDED

@@ -0,0 +1,430 @@
+import json
+import os
+import time
+from collections.abc import Sequence
+from dataclasses import dataclass
+from enum import Enum, auto
+from pathlib import Path
+from typing import Any, Generic, Literal, TypeVar, overload
+from urllib.parse import urlparse
+
+import jsonschema
+import requests
+
+from nomic.dataset import AtlasClass
+
+from .client_models import (
+    ContentExtractionMode,
+    ExtractOptions,
+    ExtractRequest,
+    FigureSummaryOptions,
+    OcrLanguage,
+    ParseOptions,
+    ParseRequest,
+    TableSummaryOptions,
+)
+
+__all__ = [
+    "NomicClient",
+    "PlatformTask",
+    "TaskFailed",
+    "TaskPending",
+    "UploadedFile",
+    # Client models
+    "ContentExtractionMode",
+    "ExtractOptions",
+    "FigureSummaryOptions",
+    "OcrLanguage",
+    "ParseOptions",
+    "TableSummaryOptions",
+]
+
+MAX_FAILRESP_LENGTH = 1_000  # chars
+
+T = TypeVar("T")
+
+client: "AtlasClass | None" = None
+
+
+def get_client():
+    global client
+    if client is None:
+        client = AtlasClass()
+    return client
+
+
+@dataclass(frozen=True)
+class UploadedFile:
+    url: str
+
+
+class Sentinel(Enum):
+    Nothing = auto()
+
+
+class TaskPending(Exception):
+    pass
+
+
+class TaskFailed(Exception):
+    pass
+
+
+class PlatformTask(Generic[T]):
+    """
+    An object representing a task on the Nomic Platform.
+
+    Attributes:
+        id: The ID of the task.
+    """
+
+    _id: str
+    _result: "T | Sentinel"
+
+    def __init__(self, id: str):
+        self._id = id
+        self._result = Sentinel.Nothing
+
+    @property
+    def id(self) -> str:
+        return self._id
+
+    def get(self, timeout: "float | None" = None, *, block: bool = True) -> T:
+        """
+        Waits for the task to complete and returns the result.
+
+        Args:
+            timeout: The maximum time to wait for the task to complete.
+            block: Whether to block until the task is complete.
+
+        Returns:
+            The result of the task.
+
+        Raises:
+            TaskPending: If the task is not complete and block is True.
+            TaskFailed: If the task fails.
+        """
+        if self._result is not Sentinel.Nothing:
+            return self._result
+        client = get_client()
+        start_time = time.time()
+        while True:
+            response = client._get(f"/v1/status/{self._id}")
+            raise_for_status_with_body(response)
+            status_resp = response.json()
+            if status_resp["status"] == "COMPLETED":
+                break
+            if status_resp["status"] == "FAILED":
+                raise TaskFailed(status_resp["error"])
+            if not block:
+                raise TaskPending
+            sleeptime = 1  # poll interval
+            if timeout is not None:
+                end_time = start_time + timeout
+                if end_time < (now := time.time()):
+                    raise TaskPending
+                sleeptime = min(sleeptime, end_time - now)
+            time.sleep(sleeptime)
+
+        completed_response = requests.get(status_resp["result_url"])
+        raise_for_status_with_body(completed_response)
+
+        result = status_resp.pop("result", {})
+        result.pop("result_url", None)
+        result.pop("result", None)
+        result["result"] = completed_response.json()
+        result["result"].pop("status", None)
+        result["result"].pop("error", None)
+        self._result = result
+        return result
+
+
+class NomicClient:
+    """Client for the Nomic Platform API."""
+
+    def upload_file(self, path: "str | os.PathLike[str]") -> UploadedFile:
+        """
+        Uploads a file to the Nomic Platform.
+
+        Args:
+            path: The path to the PDF file to upload.
+
+        Returns:
+            An UploadedFile object representing the uploaded file.
+        """
+        client = get_client()
+
+        path = Path(path)
+
+        with path.open("rb") as pdf_file:
+            file_type = path.suffix.lower()
+            if file_type == ".pdf":
+                content_type = "application/pdf"
+            # elif file_type == ".docx":
+            #     content_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+            # elif file_type == ".doc":
+            #     content_type = "application/msword"
+            # elif file_type == ".txt":
+            #     content_type = "text/plain"
+            # elif file_type == ".pptx":
+            #     content_type = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+            # elif file_type == ".ppt":
+            #     content_type = "application/vnd.ms-powerpoint"
+            # elif file_type == ".csv":
+            #     content_type = "text/csv"
+            # elif file_type == ".xlsx":
+            #     content_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            # elif file_type == ".xls":
+            #     content_type = "application/vnd.ms-excel"
+            else:
+                raise ValueError(f"Unsupported file type: {file_type}")
+
+            response = client._post(
+                "/v1/upload",
+                json=dict(files=[{"id": path.name, "size": path.stat().st_size, "content_type": content_type}]),
+            )
+            raise_for_status_with_body(response)
+
+            values = response.json()
+
+            # Extract from the files array
+            file_info = values["files"][0]
+            upload_url = file_info["upload_url"]
+            nomic_url = file_info["nomic_url"]
+
+            # upload the file to the designated pre-signed url
+            resp = requests.put(upload_url, data=pdf_file, headers={"x-amz-server-side-encryption": "AES256"})
+
+        raise_for_status_with_body(resp)
+        return UploadedFile(url=nomic_url)
+
+    @overload
+    def parse(
+        self, file: "str | UploadedFile", *, options: "ParseOptions | None" = ..., block: Literal[True] = ...
+    ) -> "dict[str, Any]": ...
+    @overload
+    def parse(
+        self, file: "str | UploadedFile", *, options: "ParseOptions | None" = ..., block: Literal[False]
+    ) -> PlatformTask["dict[str, Any]"]: ...
+    @overload
+    def parse(self, file: "str | UploadedFile", *, options: "ParseOptions | None" = ..., block: bool) -> Any: ...
+
+    def parse(self, file: "str | UploadedFile", *, options: "ParseOptions | None" = None, block: bool = True) -> Any:
+        """
+        Parses a document into a structured JSON representation.
+
+        Args:
+            file: The file to parse. Can be a string URL or an UploadedFile object.
+            options: Optional ParseOptions to customize parsing behavior (OCR settings, etc.).
+            block: Whether to block until the task is complete.
+
+        Returns:
+            By default, returns the parsed document. If block is False, returns a PlatformTask that can be used to get
+            the result.
+
+        Raises:
+            ValueError: If an invalid URL is passed.
+            TaskFailed: If block is True and the task fails.
+
+        Example:
+            Complete end-to-end workflow with upload and parsing:
+
+            ```python
+            from nomic.client import NomicClient
+
+            # Upload a PDF file
+            client = NomicClient()
+            file = client.upload_file("my_document.pdf")
+
+            # Parse the document
+            result = client.parse(file)
+            print(result)
+            ```
+        """
+        client = get_client()
+
+        file_url = self._file_to_url(file)
+        if options is not None:
+            request = ParseRequest(file_url=file_url, options=options)
+        else:
+            request = ParseRequest(file_url=file_url)
+        payload = request.model_dump(mode="json", exclude_unset=True)
+
+        response = client._post("/v1/parse", json=payload)
+        raise_for_status_with_body(response)
+        task = PlatformTask(response.json()["task_id"])
+        if block:
+            return task.get()
+        return task
+
+    @overload
+    def extract(
+        self,
+        files: "str | UploadedFile | Sequence[str | UploadedFile]",
+        schema: "dict[str, Any]",
+        *,
+        options: "ExtractOptions | None" = ...,
+        block: Literal[True] = ...,
+    ) -> Any: ...
+    @overload
+    def extract(
+        self,
+        files: "str | UploadedFile | Sequence[str | UploadedFile]",
+        schema: "dict[str, Any]",
+        *,
+        options: "ExtractOptions | None" = ...,
+        block: Literal[False],
+    ) -> PlatformTask[Any]: ...
+    @overload
+    def extract(
+        self,
+        files: "str | UploadedFile | Sequence[str | UploadedFile]",
+        schema: "dict[str, Any]",
+        *,
+        options: "ExtractOptions | None" = ...,
+        block: bool,
+    ) -> Any: ...
+
+    def extract(
+        self,
+        files: "str | UploadedFile | Sequence[str | UploadedFile]",
+        schema: "dict[str, Any]",
+        *,
+        options: "ExtractOptions | None" = None,
+        block: bool = True,
+    ) -> Any:
+        """
+        Extracts structured data from documents.
+
+        Args:
+            files: List of uploaded files to extract from.
+            schema: A JSON schema defining the structure of data to extract.
+            options: Optional ExtractOptions to customize extraction behavior (system prompt, etc.).
+            block: Whether to block until the task is complete.
+
+        Returns:
+            By default, returns the extracted data matching the provided schema. If block is False, returns a PlatformTask
+            that can be used to get the result.
+
+        Raises:
+            ValueError: If an invalid URL is passed.
+            TaskFailed: If block is True and the task fails.
+
+        Example:
+            Complete end-to-end workflow with upload and extraction:
+
+            ```python
+            from nomic.client import NomicClient
+
+            # Upload a PDF file
+            client = NomicClient()
+            file = client.upload_file("my_document.pdf")
+
+            # Define extraction schema
+            schema = {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "speaker": {"type": "string"},
+                        "content": {"type": "string"},
+                    }
+                },
+            }
+
+            # Extract structured data
+            result = client.extract(file, schema)
+            print(result)
+            ```
+        """
+        jsonschema.Draft7Validator.check_schema(schema)
+
+        if isinstance(files, (str, UploadedFile)):
+            files = [files]
+
+        client = get_client()
+
+        request = ExtractRequest(
+            file_urls=list(map(self._file_to_url, files)),
+            extraction_schema=schema,
+            system_prompt=options.system_prompt if options is not None else None,
+        )
+        payload = request.model_dump(mode="json", exclude_unset=True)
+
+        response = client._post("/v1/extract", json=payload)
+        raise_for_status_with_body(response)
+        task = PlatformTask(response.json()["task_id"])
+        if block:
+            return task.get()
+        return task
+
+    @staticmethod
+    def _file_to_url(file: "str | UploadedFile") -> str:
+        if isinstance(file, UploadedFile):
+            return file.url
+        parsed = urlparse(file)
+        if parsed.scheme in ("nomic", "http", "https"):
+            return file
+        if parsed.scheme == "file" or (not parsed.scheme and Path(file).exists()):
+            raise ValueError(
+                f"Cannot directly pass local file to platform: {file!r}\nPlease use upload_file() to upload it first."
+            )
+        if not parsed.scheme:
+            raise ValueError(f"Invalid URL: {file!r}")
+        raise ValueError(f"Unsupported scheme {parsed.scheme!r} for URL {file!r}")
+
+
+def raise_for_status_with_body(resp: requests.Response) -> None:
+    """
+    Raises HTTPError if the response is not successful.
+
+    Like Response.raise_for_status, but includes the (truncated) response body in the
+    exception message for improved diagnostics.
+
+    Args:
+        resp: The response to check
+
+    Raises:
+        requests.HTTPError: If the response is not successful
+    """
+    http_error_msg = ""
+    if isinstance(resp.reason, bytes):
+        # We attempt to decode utf-8 first because some servers
+        # choose to localize their reason strings. If the string
+        # isn't utf-8, we fall back to iso-8859-1 for all other
+        # encodings. (See PR #3538)
+        try:
+            reason = resp.reason.decode("utf-8")
+        except UnicodeDecodeError:
+            reason = resp.reason.decode("iso-8859-1")
+    else:
+        reason = resp.reason
+
+    if 400 <= resp.status_code < 500:
+        http_error_msg = f"{resp.status_code} Client Error: {reason} for url: {resp.url}"
+
+    elif 500 <= resp.status_code < 600:
+        http_error_msg = f"{resp.status_code} Server Error: {reason} for url: {resp.url}"
+
+    if http_error_msg:
+        if (ctype := resp.headers.get("content-type")) is not None:
+            http_error_msg += f"\nContent-Type: {ctype}"
+        http_error_msg += f"\nBody: {format_body(resp)}"
+        raise requests.HTTPError(http_error_msg, response=resp)
+
+
+def format_body(resp: requests.Response) -> str:
+    text = None
+    if (ctype := resp.headers.get("content-type")) and "application/json" in ctype.lower():
+        try:
+            data = resp.json()
+            text = json.dumps(data, indent=2, ensure_ascii=False)
+        except Exception:
+            pass
+    if text is None:
+        text = resp.text
+
+    limit = MAX_FAILRESP_LENGTH
+    if len(text) > limit:
+        return text[:limit] + f"\n… [truncated {len(text) - limit} chars]"
+    return text
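
Taken together, the new module gives a small upload-then-process workflow. The sketch below is assembled from the docstrings in the diff above; the file path is a placeholder, and the polling pattern simply combines the documented `block=False` and `PlatformTask.get` behavior:

```python
from nomic.client import NomicClient, TaskPending

client = NomicClient()

# Placeholder path; upload_file() only accepts .pdf in this version.
file = client.upload_file("report.pdf")

# Submit the parse without blocking; a PlatformTask handle is returned instead of the result.
task = client.parse(file, block=False)

try:
    # A single non-blocking poll; raises TaskPending if the platform is still working.
    result = task.get(block=False)
except TaskPending:
    # Block for up to 120 seconds (raises TaskPending again if that is exceeded).
    result = task.get(timeout=120)

print(result)
```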

nomic-3.8.0/nomic/client_models.py
ADDED

@@ -0,0 +1,103 @@
+"""Client-side models for the Nomic Platform API."""
+
+from enum import Enum
+
+from pydantic import BaseModel, Field
+
+__all__ = [
+    "ContentExtractionMode",
+    "OcrLanguage",
+    "TableSummaryOptions",
+    "FigureSummaryOptions",
+    "ParseOptions",
+    "ExtractOptions",
+    "ParseRequest",
+    "ExtractRequest",
+]
+
+
+class ContentExtractionMode(str, Enum):
+    """The overall strategy for extracting content from the document."""
+
+    Metadata = "metadata"  # Disable all OCR. Only use embedded document text.
+    Hybrid = "hybrid"  # Use a VLM for tables, and run an OCR model on all bitmaps found in the document.
+    Ocr = "ocr"  # Use a VLM for tables. Run an OCR model on full pages.
+
+
+class OcrLanguage(str, Enum):
+    """Language selection for OCR."""
+
+    English = "en"
+    Latin = "latin"
+    Chinese_Japanese_English = "zh_ja_en"
+
+
+class TableSummaryOptions(BaseModel):
+    """Options for generating table summaries."""
+
+    enabled: bool = Field(
+        default=False,
+        description="Whether to generate a summary of table content",
+    )
+
+
+class FigureSummaryOptions(BaseModel):
+    """Options for generating figure summaries."""
+
+    enabled: bool = Field(
+        default=True,
+        description="Whether to generate a summary of figure content",
+    )
+
+
+class ParseOptions(BaseModel):
+    """Options to customize document parsing."""
+
+    content_extraction_mode: ContentExtractionMode = Field(
+        default=ContentExtractionMode.Hybrid,
+        description="The overall strategy for extracting content from the document",
+    )
+    ocr_language: OcrLanguage = Field(
+        default=OcrLanguage.English,
+        description="Language selection for OCR",
+    )
+    table_summary: TableSummaryOptions | None = Field(
+        default=None,
+        description="Options for generating table summaries",
+    )
+    figure_summary: FigureSummaryOptions | None = Field(
+        default=None,
+        description="Options for generating figure summaries",
+    )
+
+
+class ExtractOptions(BaseModel):
+    """Options to customize document extraction."""
+
+    system_prompt: str | None = Field(
+        default=None,
+        description="Custom system prompt to guide the AI extraction process across the entire file. "
+        "Use this to provide specific instructions, context, or constraints for how information "
+        "should be extracted and formatted according to your requirements.",
+    )
+
+
+class ParseRequest(BaseModel):
+    """Request model for parsing a document."""
+
+    file_url: str = Field(description="File URL to process")
+    options: ParseOptions = Field(
+        default_factory=ParseOptions,
+        description="Options to customize document parsing",
+    )
+
+
+class ExtractRequest(BaseModel):
+    """Request model for extracting data from documents."""
+
+    file_urls: list[str] = Field(description="List of file URLs to process for extraction")
+    extraction_schema: dict = Field(description="JSON schema defining the structure of data to extract")
+    system_prompt: str | None = Field(
+        default=None,
+        description="Custom system prompt to guide the AI extraction process",
+    )
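
These models plug directly into `NomicClient.parse` and `NomicClient.extract`. A sketch of how the options might be used, based on the fields and defaults shown above (the file path and schema are placeholders):

```python
from nomic.client import NomicClient
from nomic.client_models import (
    ContentExtractionMode,
    ExtractOptions,
    FigureSummaryOptions,
    OcrLanguage,
    ParseOptions,
    TableSummaryOptions,
)

client = NomicClient()
file = client.upload_file("scanned_contract.pdf")  # placeholder path

# Force full-page OCR and turn on table/figure summaries for parsing.
options = ParseOptions(
    content_extraction_mode=ContentExtractionMode.Ocr,
    ocr_language=OcrLanguage.English,
    table_summary=TableSummaryOptions(enabled=True),
    figure_summary=FigureSummaryOptions(enabled=True),
)
parsed = client.parse(file, options=options)

# Steer extraction with a custom system prompt.
schema = {
    "type": "object",
    "properties": {"party_names": {"type": "array", "items": {"type": "string"}}},
}
extracted = client.extract(
    file,
    schema,
    options=ExtractOptions(system_prompt="Only extract the names of the contracting parties."),
)
```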

{nomic-3.5.2 → nomic-3.8.0}/nomic/dataset.py

@@ -8,11 +8,12 @@ import os
 import re
 import time
 import unicodedata
+from collections.abc import Iterable, Mapping
 from contextlib import contextmanager
 from datetime import datetime
 from io import BytesIO
 from pathlib import Path
-from typing import Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Protocol, Tuple, TypeVar, Union
 
 import numpy as np
 import pandas as pd
@@ -37,6 +38,20 @@ from .data_operations import AtlasMapData, AtlasMapDuplicates, AtlasMapEmbedding
 from .settings import *
 from .utils import assert_valid_project_id, download_feather
 
+if TYPE_CHECKING:
+    from typing_extensions import TypeAlias
+
+    T_co = TypeVar("T_co", covariant=True)
+
+
+    class SupportsRead(Protocol[T_co]):
+        def read(self, length: int = ..., /) -> T_co: ...
+
+
+    _Data: "TypeAlias" = (
+        "Iterable[bytes] | str | bytes | SupportsRead[str | bytes] | list[tuple[Any, Any]] | tuple[tuple[Any, Any], ...] | Mapping[Any, Any]"
+    )
+
 
 class AtlasUser:
     def __init__(self):
@@ -410,6 +425,30 @@ class AtlasClass(object):
 
         return organization_slug, organization_id
 
+    def _post(self, endpoint: str, *, json: "Any | None" = None) -> requests.Response:
+        response = requests.post(
+            self.atlas_api_path + endpoint,
+            headers=self.header,
+            json=json,
+        )
+        return response
+
+    def _get(self, endpoint: str) -> requests.Response:
+        response = requests.get(
+            self.atlas_api_path + endpoint,
+            headers=self.header,
+        )
+        return response
+
+    def _put(self, endpoint: str, *, data: "_Data | None" = None, json: "Any | None" = None) -> requests.Response:
+        response = requests.put(
+            self.atlas_api_path + endpoint,
+            headers=self.header,
+            data=data,
+            json=json,
+        )
+        return response
+
 
 class AtlasIndex:
     """
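
The `_get`/`_post`/`_put` helpers added to `AtlasClass` are the thin wrappers the new `nomic/client.py` calls into: they prepend the configured API base path and attach the authenticated headers, then return the raw `requests.Response`. A sketch of the equivalence (the payload is hypothetical):

```python
import requests

from nomic.dataset import AtlasClass

atlas = AtlasClass()
payload = {"file_url": "nomic://example-file"}  # hypothetical payload

# The new helper...
resp_a = atlas._post("/v1/parse", json=payload)

# ...does roughly the same as calling requests directly with the instance's
# base path and auth headers:
resp_b = requests.post(atlas.atlas_api_path + "/v1/parse", headers=atlas.header, json=payload)
```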

{nomic-3.5.2 → nomic-3.8.0}/nomic.egg-info/SOURCES.txt

@@ -1,13 +1,17 @@
+MANIFEST.in
 README.md
 pyproject.toml
 setup.py
 nomic/__init__.py
 nomic/atlas.py
 nomic/cli.py
+nomic/client.py
+nomic/client_models.py
 nomic/data_inference.py
 nomic/data_operations.py
 nomic/dataset.py
 nomic/embed.py
+nomic/py.typed
 nomic/settings.py
 nomic/utils.py
 nomic.egg-info/PKG-INFO

{nomic-3.5.2 → nomic-3.8.0}/setup.py

@@ -23,7 +23,7 @@ with open("README.md") as f:
 
 setup(
     name="nomic",
-    version="3.5.2",
+    version="3.8.0",
     url="https://github.com/nomic-ai/nomic",
     description=description,
     long_description=long_description,
@@ -35,15 +35,17 @@ setup(
         "License :: OSI Approved :: Apache Software License",
         "Programming Language :: Python :: 3",
     ],
+    requires_python='>=3.10',
     install_requires=[
         "click",
         "jsonlines",
+        "jsonschema>=4.23.0,<5",
         "loguru",
         "rich",
         "requests",
         "numpy",
         "pandas",
-        "pydantic",
+        "pydantic>=2,<3",
         "tqdm",
         "pyarrow",
         "pillow",
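
The tightened pins line up with the new modules: `client_models.py` is written against Pydantic v2 (`model_dump(mode="json", exclude_unset=True)` does not exist in v1, which only offers `.dict()`), and `client.py` validates user schemas with `jsonschema.Draft7Validator`. A small sketch, assuming the models from the diff above:

```python
from nomic.client_models import ParseRequest

# Pydantic v2's exclude_unset keeps only the fields the caller actually set,
# which is what client.py relies on when building request payloads.
req = ParseRequest(file_url="nomic://example-file")  # hypothetical URL
print(req.model_dump(mode="json", exclude_unset=True))
# -> {'file_url': 'nomic://example-file'}
```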
nomic-3.5.2/nomic/__init__.py
DELETED

(remaining files without changes)