indexify 0.0.20.tar.gz → 0.0.22.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.0.20 → indexify-0.0.22}/PKG-INFO +1 -1
- {indexify-0.0.20 → indexify-0.0.22}/indexify/__init__.py +3 -2
- {indexify-0.0.20 → indexify-0.0.22}/indexify/client.py +151 -137
- indexify-0.0.22/indexify/error.py +30 -0
- indexify-0.0.22/indexify/extraction_policy.py +68 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/extractor.py +6 -1
- indexify-0.0.22/indexify/settings.py +2 -0
- {indexify-0.0.20 → indexify-0.0.22}/pyproject.toml +1 -1
- indexify-0.0.20/indexify/extraction_policy.py +0 -28
- indexify-0.0.20/indexify/settings.py +0 -2
- {indexify-0.0.20 → indexify-0.0.22}/LICENSE.txt +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/README.md +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/data_containers.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/exceptions.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/index.py +0 -0
- {indexify-0.0.20 → indexify-0.0.22}/indexify/utils.py +0 -0
{indexify-0.0.20 → indexify-0.0.22}/indexify/__init__.py

```diff
@@ -1,6 +1,6 @@
 from .index import Index
 from .client import IndexifyClient
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraphBuilder, ExtractionGraph
 from .client import IndexifyClient, Document
 from .settings import DEFAULT_SERVICE_URL
 
@@ -8,6 +8,7 @@ __all__ = [
     "Index",
     "Document",
     "IndexifyClient",
-    "ExtractionPolicy",
+    "ExtractionGraph",
+    "ExtractionGraphBuilder" "ExtractionPolicy",
     "DEFAULT_SERVICE_URL",
 ]
```
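Because adjacent string literals concatenate in Python, the last added `__all__` entry is one fused name rather than two. A quick sketch of the consequence, using the names from the diff above:

```python
# "ExtractionGraphBuilder" "ExtractionPolicy" fuses at compile time into a
# single string, so __all__ exports neither name individually.
__all__ = [
    "Index",
    "Document",
    "IndexifyClient",
    "ExtractionGraph",
    "ExtractionGraphBuilder" "ExtractionPolicy",
    "DEFAULT_SERVICE_URL",
]
print("ExtractionGraphBuilderExtractionPolicy" in __all__)  # True
print("ExtractionPolicy" in __all__)                        # False
```

Direct imports such as `from indexify import ExtractionPolicy` still work, since the symbols themselves are imported at the top of the module; only wildcard imports are affected.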
{indexify-0.0.20 → indexify-0.0.22}/indexify/client.py

```diff
@@ -6,19 +6,20 @@ import json
 from collections import namedtuple
 from .settings import DEFAULT_SERVICE_URL
 from .extractor import Extractor
-from .extraction_policy import ExtractionPolicy
+from .extraction_policy import ExtractionPolicy, ExtractionGraph
 from .index import Index
 from .utils import json_set_default
+from .error import Error
 from .data_containers import TextChunk
 from indexify.exceptions import ApiException
 from dataclasses import dataclass
-
 from typing import List, Optional, Union, Dict
 
 Document = namedtuple("Document", ["text", "labels", "id"])
 
 SQLQueryRow = namedtuple("SQLQueryRow", ["content_id", "data"])
 
+
 @dataclass
 class SqlQueryResult:
     result: List[Dict]
@@ -45,22 +46,22 @@ class IndexifyClient:
 
     def __init__(
         self,
-        service_url: str = DEFAULT_SERVICE_URL,
+        service_url: str = DEFAULT_SERVICE_URL, # switch this to DEFAULT_SERVICE_URL_HTTPS for TLS
         namespace: str = "default",
         config_path: Optional[str] = None,
         *args,
         **kwargs,
     ):
         if config_path:
-            with open(config_path,
+            with open(config_path, "r") as file:
                 config = yaml.safe_load(file)
-
-            if config.get(
-                tls_config = config[
+
+            if config.get("use_tls", False):
+                tls_config = config["tls_config"]
                 self._client = httpx.Client(
                     http2=True,
-                    cert=(tls_config[
-                    verify=tls_config.get(
+                    cert=(tls_config["cert_path"], tls_config["key_path"]),
+                    verify=tls_config.get("ca_bundle_path", True),
                 )
             else:
                 self._client = httpx.Client(*args, **kwargs)
```
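The config file layout expected by the new TLS branch can be read off the keys above. A minimal sketch, assuming those key names and placeholder paths; `DEFAULT_SERVICE_URL_HTTPS`, mentioned in the new comment, presumably comes from the rewritten settings.py (whose contents are not shown in this diff):

```python
from indexify import IndexifyClient

# indexify.yaml (hypothetical file name) would contain:
#   use_tls: true
#   tls_config:
#     cert_path: /path/to/client.crt
#     key_path: /path/to/client.key
#     ca_bundle_path: /path/to/ca.crt   # optional; verify defaults to True
client = IndexifyClient(config_path="indexify.yaml")
```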
```diff
@@ -68,17 +69,13 @@ class IndexifyClient:
             self._client = httpx.Client(*args, **kwargs)
 
         self.namespace: str = namespace
-        self.
+        self.extraction_graphs: List[ExtractionGraph] = []
         self.labels: dict = {}
         self._service_url = service_url
+        self._timeout = kwargs.get("timeout")
 
         # get namespace data
-        response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
-        resp_json = response.json()
-        # initialize extraction_policies
-        for eb in resp_json["namespace"]["extraction_policies"]:
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
+        self.extraction_graphs = self.get_extraction_graphs()
 
     @classmethod
     def with_mtls(
@@ -128,12 +125,18 @@ class IndexifyClient:
         return client
 
     def _request(self, method: str, **kwargs) -> httpx.Response:
-        response = self._client.request(method,timeout=None, **kwargs)
         try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
-
+            response = self._client.request(method, timeout=self._timeout, **kwargs)
+            status_code = str(response.status_code)
+            if status_code.startswith("4") or status_code.startswith("5"):
+                error = Error.from_tonic_error_string(str(response.url), response.text)
+                self.__print_additional_error_context(error)
+                raise error
+        except httpx.ConnectError:
+            message = f"Make sure the server is running and accesible at {self._service_url}"
+            error = Error(status="ConnectionError", message=message)
+            print(error)
+            raise error
         return response
 
     def get(self, endpoint: str, **kwargs) -> httpx.Response:
```
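`_request` now converts 4xx/5xx responses and connection failures into the new `Error` exception instead of surfacing raw httpx errors, so callers catch a single type. A sketch; note that any request-issuing call can raise it, including the constructor, which now fetches the extraction graphs:

```python
from indexify import IndexifyClient
from indexify.error import Error

try:
    client = IndexifyClient()  # fetches extraction graphs, so this may raise
    client.get("namespaces/default")
except Error as e:
    # e.status is e.g. "ConnectionError" or "GeneralError"; e.message is the
    # server-provided text. str(e) renders as "<status> | <Message>".
    print(e.status, "->", e.message)
```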
````diff
@@ -188,7 +191,7 @@ class IndexifyClient:
         ```
         """
         return self._request("PUT", url=f"{self._service_url}/{endpoint}", **kwargs)
-
+
     def delete(self, endpoint: str, **kwargs) -> httpx.Response:
         """
         Make a DELETE request to the Indexify service.
@@ -243,9 +246,9 @@ class IndexifyClient:
     def create_namespace(
         self,
         namespace: str,
-
+        extraction_graphs: list = [],
         labels: dict = {},
-        service_url: str = DEFAULT_SERVICE_URL
+        service_url: str = DEFAULT_SERVICE_URL,
     ) -> "IndexifyClient":
         """
         Create a new namespace.
@@ -253,16 +256,16 @@ class IndexifyClient:
         Returns:
             IndexifyClient: a new client with the given namespace
         """
-
-        for bd in
-            if isinstance(bd,
-
+        extraction_graphs = []
+        for bd in extraction_graphs:
+            if isinstance(bd, extraction_graphs):
+                extraction_graphs.append(bd.to_dict())
             else:
-
+                extraction_graphs.append(bd)
 
         req = {
             "name": namespace,
-            "
+            "extraction_graphs": extraction_graphs,
             "labels": labels,
         }
 
````
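As released, the body above reassigns the `extraction_graphs` parameter to `[]` before iterating over it, so the loop never runs and whatever the caller passes is silently dropped (had it run, `isinstance(bd, extraction_graphs)` would also raise `TypeError`, since isinstance's second argument must be a type, not a list). A usage sketch under that behavior, with `client` being an existing `IndexifyClient` and the namespace name a placeholder:

```python
# The namespace is created, but with an empty extraction_graphs list
# regardless of what is passed in, per the loop above.
ns_client = client.create_namespace(
    namespace="research",   # placeholder namespace name
    labels={"team": "ml"},
)
```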
```diff
@@ -289,7 +292,6 @@ class IndexifyClient:
             List[Index]: list of indexes in the current namespace
         """
         response = self.get(f"namespaces/{self.namespace}/indexes")
-        response.raise_for_status()
         return response.json()["indexes"]
 
     def extractors(self) -> List[Extractor]:
@@ -306,69 +308,36 @@ class IndexifyClient:
             extractors.append(Extractor.from_dict(ed))
         return extractors
 
-    def get_extraction_policies(
+    def get_extraction_graphs(self) -> List[ExtractionGraph]:
         """
         Retrieve and update the list of extraction policies for the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}")
-        response.raise_for_status()
+        json = response.json()
 
-        self.
-        for
-            self.extraction_policies.append(ExtractionPolicy.from_dict(eb))
-        return self.extraction_policies
+        self.extraction_graphs = []
+        for graph in json["namespace"]["extraction_graphs"]:
+            self.extraction_graphs.append(ExtractionGraph.from_dict(graph))
 
-    def add_extraction_policy(
-        self,
-        extractor: str,
-        name: str,
-        input_params: dict = {},
-        labels_eq: str = None,
-        content_source="ingestion",
-    ) -> dict:
-        """Add a new extraction policy.
-
-        Args:
-            - extractor (str): Name of the extractor
-            - name (str): Name for this instance
-            - input_params (dict): Dictionary containing extractor input params
-            - filter (Filter): Optional filter for this extractor
-
-        Returns:
-            dict: response payload
-
-        Examples:
-            >>> repo.add_extraction_policy("EfficientNet", "efficientnet")
-
-            >>> repo.add_extraction_policy("MiniLML6", "minilm")
+        return self.extraction_graphs
 
+    def create_extraction_graph(self, extraction_graph: ExtractionGraph):
         """
-        req = {
-            "extractor": extractor,
-            "name": name,
-            "input_params": input_params,
-            "filters_eq": labels_eq,
-            "content_source": content_source,
-        }
-        if req["filters_eq"] == None:
-            del req["filters_eq"]
+        Create a new extraction graph.
 
+        Args:
+            - extraction_graph (ExtractionGraph): the extraction graph to create
+        """
+        req = extraction_graph.to_dict()
+        req["namespace"] = self.namespace
         request_body = json.dumps(req, default=json_set_default)
         response = self.post(
-            f"namespaces/{self.namespace}/
+            f"namespaces/{self.namespace}/extraction_graphs",
             data=request_body,
             headers={"Content-Type": "application/json"},
        )
-
-        # update self.extractor_bindings
-        self.get_extraction_policies()
-
-        try:
-            response.raise_for_status()
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
         return
-
+
     def get_content_metadata(self, content_id: str) -> dict:
         """
         Get metadata for a specific content ID in a given index.
```
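`add_extraction_policy` is gone: policies are now grouped into a named `ExtractionGraph` and registered in one call. A minimal sketch; the extractor name is a placeholder (use a name reported by `client.extractors()`):

```python
from indexify import ExtractionGraph, ExtractionPolicy

graph = ExtractionGraph(
    id=None,                          # to_dict() drops None-valued fields
    name="summaries",
    extraction_policies=[
        ExtractionPolicy(
            extractor="my-extractor",  # placeholder extractor name
            name="summarize",
            content_source="ingestion",
        )
    ],
)
client.create_extraction_graph(graph)
```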
```diff
@@ -377,52 +346,32 @@ class IndexifyClient:
             - content_id (str): content id to query
         """
         response = self.get(f"namespaces/{self.namespace}/content/{content_id}")
-        response.raise_for_status()
         return response.json()
-
-    def
-        self,
-        content_id: str = None,
-    ):
-        """
-        Get list of content from current namespace.
-
-        Args:
-            - parent_id (str): Optional filter for parent id
-            - labels_eq (str): Optional filter for labels
-        """
-        params = {"parent_id": content_id}
-
-        response = self.get(f"namespaces/{self.namespace}/content", params=params)
-        response.raise_for_status()
-        return [
-            self._add_content_url(content)
-            for content in response.json()["content_list"]
-        ]
-
-    def download_content(self, id:str) -> bytes:
+
+    def download_content(self, id: str) -> bytes:
         """
         Download content from id. Return bytes
-
+
         Args:
             - id (str): id of content to download
         """
         response = self.get(f"namespaces/{self.namespace}/content/{id}/download")
-        try:
-            response.raise_for_status()
-            return response.content
-        except httpx.HTTPStatusError as exc:
-            raise ApiException(exc.response.text)
+        return response.content
 
     def add_documents(
-        self,
-
+        self,
+        extraction_graphs: Union[str, List[str]],
+        documents: Union[Document, str, List[Union[Document, str]]],
+        doc_id=None,
+    ) -> Union[str, List[str]]:
         """
         Add documents to current namespace.
 
         Args:
             - documents (Union[Document, str, List[Union[Document, str]]]): this can be a list of strings, list of Documents or a mix of both
         """
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
         if isinstance(documents, Document):
             documents = [documents]
         elif isinstance(documents, str):
@@ -433,7 +382,9 @@ class IndexifyClient:
             if isinstance(item, Document):
                 new_documents.append(item)
             elif isinstance(item, str):
-                new_documents.append(
+                new_documents.append(
+                    Document(item, {}, id=None)
+                )  # don't pass in id for a string content because doesn't make sense to have same content id for all strings
             else:
                 raise ValueError(
                     "List items must be either Document instances or strings."
@@ -444,13 +395,21 @@ class IndexifyClient:
                 "Invalid type for documents. Expected Document, str, or list of these."
             )
 
-        req = {
+        req = {
+            "documents": [doc._asdict() for doc in documents],
+            "extraction_graph_names": extraction_graphs,
+        }
         response = self.post(
             f"namespaces/{self.namespace}/add_texts",
             json=req,
             headers={"Content-Type": "application/json"},
         )
         response.raise_for_status()
+        response_json = response.json()
+        content_ids = response_json["content_ids"]
+        if len(documents) == 1 and len(content_ids) == 1:
+            return content_ids[0]
+        return content_ids
 
     def delete_documents(self, document_ids: List[str]) -> None:
         """
```
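`add_documents` now takes the target graph name(s) first and returns the server-assigned content id(s), collapsing to a single id when a single document goes in. A sketch reusing the `summaries` graph from the earlier example:

```python
# One string in -> one content id back, per the return logic above.
content_id = client.add_documents("summaries", "Indexify 0.0.22 release notes.")

# Lists in -> a list of ids back.
ids = client.add_documents(["summaries"], ["first doc", "second doc"])
```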
```diff
@@ -465,7 +424,6 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
 
     def update_content(self, document_id: str, path: str) -> None:
         """
@@ -475,8 +433,9 @@ class IndexifyClient:
             - path (str): relative path to the file to be uploaded
         """
         with open(path, "rb") as f:
-            response = self.put(
-
+            response = self.put(
+                f"namespaces/{self.namespace}/content/{document_id}", files={"file": f}
+            )
 
     def get_structured_data(self, content_id: str) -> dict:
         """
@@ -485,11 +444,14 @@ class IndexifyClient:
         Args:
             - content_id (str): content id to query
         """
-        response = self.get(
-
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/metadata"
+        )
+        return response.json().get("metadata", [])
 
-    def search_index(
+    def search_index(
+        self, name: str, query: str, top_k: int, filters: List[str] = []
+    ) -> list[TextChunk]:
         """
         Search index in the current namespace.
 
@@ -505,10 +467,9 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()["results"]
 
-    def upload_file(self, path: str, id=None, labels: dict = {}) -> str:
+    def upload_file(self, extraction_graphs: Union[str, List[str]], path: str, id=None, labels: dict = {}) -> str:
         """
         Upload a file.
 
@@ -516,9 +477,11 @@ class IndexifyClient:
             - path (str): relative path to the file to be uploaded
             - labels (dict): labels to be associated with the file
         """
-
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        params = {"extraction_graph_names": extraction_graphs}
         if id is not None:
-            params[
+            params["id"] = id
         with open(path, "rb") as f:
             response = self.post(
                 f"namespaces/{self.namespace}/upload_file",
@@ -526,7 +489,6 @@ class IndexifyClient:
                 data=labels,
                 params=params,
             )
-        response.raise_for_status()
         response_json = response.json()
         return response_json["content_id"]
 
```
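`upload_file` follows the same pattern: the graph name(s) come first and the new content id comes back. A sketch with a placeholder local path:

```python
content_id = client.upload_file(
    "summaries",                 # graph created earlier
    "report.pdf",                # placeholder local file path
    labels={"source": "upload"},
)
```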
```diff
@@ -535,20 +497,47 @@ class IndexifyClient:
         List all schemas in the current namespace.
         """
         response = self.get(f"namespaces/{self.namespace}/schemas")
-        response.raise_for_status()
         return response.json()
-
-    def get_content_tree(self, content_id:str):
+
+    def get_content_tree(self, content_id: str):
         """
         Get content tree for a given content id
 
         Args:
             - content_id (str): id of content
         """
-        response = self.get(
-
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/content-tree"
+        )
         return response.json()
+
+    def get_extracted_content(self, content_id: str, level: int = 0):
+        """
+        Get list of child for a given content id and their content up to the specified level.
+
+        Args:
+            - content_id (str): id of content
+            - level (int): depth of content retrieval (default: 0)
+        """
+        content_tree = self.get_content_tree(content_id)
+        child_list = []
+
+        def traverse_content(parent_id, current_level):
+            if current_level > level:
+                return
+
+            for item in content_tree['content_tree_metadata']:
+                if item['parent_id'] == parent_id:
+                    child_id = item['id']
+                    content = self.download_content(child_id)
+                    child_list.append({'id': child_id, 'content': content})
+
+                    traverse_content(child_id, current_level + 1)
+
+        traverse_content(content_id, 0)
 
+        return child_list
+
     def sql_query(self, query: str):
         """
         Execute a SQL query.
```
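In `get_extracted_content`, recursion stops once `current_level` exceeds `level`, so the default `level=0` returns only the direct children of `content_id`, each paired with its downloaded bytes. A sketch:

```python
# Direct children only (level=0 is the default).
for child in client.get_extracted_content(content_id):
    print(child["id"], len(child["content"]), "bytes")

# Raise level to include grandchildren as well.
deeper = client.get_extracted_content(content_id, level=1)
```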
```diff
@@ -562,24 +551,38 @@ class IndexifyClient:
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         result = response.json()
         rows = []
         for row in result["rows"]:
             data = row["data"]
             rows.append(data)
         return SqlQueryResult(result=rows)
-
-    def ingest_remote_file(
-
+
+    def ingest_remote_file(
+        self, extraction_graphs: Union[str, List[str]], url: str, mime_type: str, labels: Dict[str, str], id=None
+    ):
+        if isinstance(extraction_graphs, str):
+            extraction_graphs = [extraction_graphs]
+        req = {"url": url, "mime_type": mime_type, "labels": labels, "id": id, "extraction_graph_names": extraction_graphs}
         response = self.post(
             f"namespaces/{self.namespace}/ingest_remote_file",
             json=req,
             headers={"Content-Type": "application/json"},
         )
-        response.raise_for_status()
         return response.json()
 
+    def wait_for_extraction(self, content_id: str):
+        """
+        Wait for extraction to complete for a given content id
+
+        Args:
+            - content_id (str): id of content
+        """
+        response = self.get(
+            f"namespaces/{self.namespace}/content/{content_id}/wait"
+        )
+        response.raise_for_status()
+
     def generate_unique_hex_id(self):
         """
         Generate a unique hexadecimal identifier
```
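`ingest_remote_file` now routes the fetched file through named graphs, and the new `wait_for_extraction` blocks on the server's wait endpoint. A sketch; the URL is a placeholder, and the `content_id` key in the response is an assumption based on `upload_file`'s response shape:

```python
result = client.ingest_remote_file(
    "summaries",                        # graph name(s)
    "https://example.com/report.pdf",   # placeholder remote URL
    "application/pdf",
    labels={},
)
client.wait_for_extraction(result["content_id"])  # assumed response key
```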
```diff
@@ -588,18 +591,29 @@ class IndexifyClient:
             str: a unique hexadecimal string
         """
         return uuid.uuid4().hex[:16]
-
+
     def generate_hash_from_string(self, input_string: str):
         """
         Generate a hash for the given string and return it as a hexadecimal string.
-
+
         Args:
             input_string (str): The input string to hash.
-
+
         Returns:
             str: The hexadecimal hash of the input string.
         """
         hash_object = hashlib.sha256(input_string.encode())
         return hash_object.hexdigest()[:16]
 
+    def __print_additional_error_context(self, error: Error):
+        print(error)
+
+        if error.status == "ExtractionGraphError":
+            graphs = [eg.name for eg in self.extraction_graphs]
+            extractors = [ext.name for ext in self.extractors()]
+            print(f"Available extraction graphs: {graphs}")
+            print(f"Available extractors: {extractors}")
 
+        if error.status == "SearchError":
+            indexes = [index["name"] for index in self.indexes()]
+            print(f"Available indexes: {indexes}")
```
indexify-0.0.22/indexify/error.py

```diff
@@ -0,0 +1,30 @@
+class Error(Exception):
+    status: str
+    message: str
+
+    def __init__(self, status: str, message: str):
+        self.status = status
+        self.message = message
+
+    @staticmethod
+    def from_tonic_error_string(url: str, error: str) -> "Error":
+        data = error.split(", ")
+
+        message = data[1].split(": ", 1)[1]
+        if message.startswith('"') and message.endswith('"'):
+            message = message[1:-1]
+
+        status = "GeneralError"
+        if "extraction_graph" in url:
+            status = "ExtractionGraphError"
+        elif "search" in url:
+            status = "SearchError"
+
+        error = Error(status, message)
+        return error
+
+    def __str__(self):
+        return f"{self.status} | {self.message.capitalize()}"
+
+    def __repr__(self):
+        return f"Error(status={self.status!r}, message={self.message!r})"
```
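`from_tonic_error_string` expects a tonic/gRPC-style body of the form `status: <code>, message: "<text>"` and classifies the error from the request URL. A worked example with an illustrative URL and error string:

```python
from indexify.error import Error

err = Error.from_tonic_error_string(
    "http://localhost:8900/namespaces/default/indexes/search",  # illustrative URL
    'status: NotFound, message: "index not found"',
)
print(err.status)  # SearchError, because "search" appears in the URL
print(err)         # SearchError | Index not found
```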
indexify-0.0.22/indexify/extraction_policy.py

```diff
@@ -0,0 +1,68 @@
+from dataclasses import dataclass, asdict
+from typing import Optional, List
+
+
+@dataclass
+class ExtractionPolicy:
+    extractor: str
+    name: str
+    content_source: str
+    input_params: Optional[dict] = None
+    id: Optional[str] = None
+    labels_eq: Optional[str] = None
+
+    def __repr__(self) -> str:
+        return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        if "filters_eq" in json:
+            json["labels_eq"] = json.pop("filters_eq")
+        json["id"] = json.get("id", None)
+        return ExtractionPolicy(**json)
+
+
+@dataclass
+class ExtractionGraph:
+    id: str
+    name: str
+    extraction_policies: List[ExtractionPolicy]
+
+    @classmethod
+    def from_dict(cls, json: dict):
+        json["id"] = json.get("id", None)
+        if "namespace" in json.keys():
+            json.pop("namespace")
+        return ExtractionGraph(**json)
+
+    @staticmethod
+    def from_yaml(spec: str):
+        import yaml
+
+        return ExtractionGraph.from_dict(yaml.load(spec, Loader=yaml.FullLoader))
+
+    def to_dict(self) -> dict:
+        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
+        return filtered_dict
+
+
+class ExtractionGraphBuilder:
+    def __init__(self, name: str):
+        self.name = name
+        self.extraction_policies = []
+
+    def policy(self, policy: ExtractionPolicy) -> "ExtractionGraphBuilder":
+        self.extraction_policies.append(policy)
+        return self
+
+    def build(self):
+        return ExtractionGraph(
+            id=self.id, name=self.name, extraction_policies=self.extraction_policies
+        )
```
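The builder collects policies fluently, but note that `build()` reads `self.id`, which `__init__` never assigns, so as released it raises `AttributeError` unless the caller sets `builder.id` first. `ExtractionGraph.from_yaml` sidesteps that by tolerating a missing `id`. A sketch; the extractor name is a placeholder:

```python
from indexify.extraction_policy import ExtractionGraph

spec = """
name: summaries
extraction_policies:
  - extractor: my-extractor
    name: summarize
    content_source: ingestion
"""
graph = ExtractionGraph.from_yaml(spec)
print(graph.name)               # summaries
print(graph.to_dict())          # id was None, so to_dict() drops it
```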
{indexify-0.0.20 → indexify-0.0.22}/indexify/extractor.py

```diff
@@ -17,7 +17,12 @@ class ExtractorSchema:
 
 class Extractor:
     def __init__(
-        self,
+        self,
+        name: str,
+        description: str,
+        input_params: dict,
+        outputs: ExtractorSchema,
+        input_mime_types: list[str],
     ):
         self.name = name
         self.description = description
```
indexify-0.0.20/indexify/extraction_policy.py

```diff
@@ -1,28 +0,0 @@
-from dataclasses import dataclass, asdict
-from typing import Optional
-
-
-@dataclass
-class ExtractionPolicy:
-    extractor: str
-    name: str
-    content_source: str
-    input_params: dict
-    id: Optional[str] = None
-    labels_eq: Optional[str] = None
-
-    def __repr__(self) -> str:
-        return f"ExtractionPolicy(name={self.name} extractor={self.extractor})"
-
-    def __str__(self) -> str:
-        return self.__repr__()
-
-    def to_dict(self) -> dict:
-        filtered_dict = {k: v for k, v in asdict(self).items() if v is not None}
-        return filtered_dict
-
-    @classmethod
-    def from_dict(cls, json: dict):
-        if "filters_eq" in json:
-            json["labels_eq"] = json.pop("filters_eq")
-        return ExtractionPolicy(**json)
```