unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/test_sharepoint.py +161 -10
- test/unit/v2/embedders/test_bedrock.py +1 -1
- test/unit/v2/embedders/test_huggingface.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/azure_openai.py +6 -0
- unstructured_ingest/embed/bedrock.py +16 -6
- unstructured_ingest/embed/huggingface.py +3 -1
- unstructured_ingest/embed/interfaces.py +61 -23
- unstructured_ingest/embed/mixedbreadai.py +28 -114
- unstructured_ingest/embed/octoai.py +19 -51
- unstructured_ingest/embed/openai.py +17 -55
- unstructured_ingest/embed/togetherai.py +16 -58
- unstructured_ingest/embed/vertexai.py +15 -46
- unstructured_ingest/embed/voyageai.py +17 -52
- unstructured_ingest/v2/errors.py +7 -0
- unstructured_ingest/v2/processes/connectors/neo4j.py +129 -43
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -4
- unstructured_ingest/v2/processes/embedder.py +9 -7
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/METADATA +101 -89
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/RECORD +24 -24
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/WHEEL +1 -1
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.2.dist-info → unstructured_ingest-0.5.4.dist-info}/top_level.txt +0 -0
|
@@ -8,9 +8,9 @@ from contextlib import asynccontextmanager
|
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from enum import Enum
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
|
|
11
|
+
from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional
|
|
12
12
|
|
|
13
|
-
from pydantic import BaseModel, ConfigDict, Field, Secret
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator
|
|
14
14
|
|
|
15
15
|
from unstructured_ingest.error import DestinationConnectionError
|
|
16
16
|
from unstructured_ingest.logger import logger
|
|
@@ -30,6 +30,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
30
30
|
DestinationRegistryEntry,
|
|
31
31
|
)
|
|
32
32
|
|
|
33
|
+
SimilarityFunction = Literal["cosine"]
|
|
34
|
+
|
|
33
35
|
if TYPE_CHECKING:
|
|
34
36
|
from neo4j import AsyncDriver, Auth
|
|
35
37
|
from networkx import Graph, MultiDiGraph
|
|
@@ -44,9 +46,9 @@ class Neo4jAccessConfig(AccessConfig):
|
|
|
44
46
|
class Neo4jConnectionConfig(ConnectionConfig):
|
|
45
47
|
access_config: Secret[Neo4jAccessConfig]
|
|
46
48
|
connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
|
|
47
|
-
username: str
|
|
49
|
+
username: str = Field(default="neo4j")
|
|
48
50
|
uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
|
|
49
|
-
database: str = Field(description="Name of the target database")
|
|
51
|
+
database: str = Field(default="neo4j", description="Name of the target database")
|
|
50
52
|
|
|
51
53
|
@requires_dependencies(["neo4j"], extras="neo4j")
|
|
52
54
|
@asynccontextmanager
|
|
@@ -186,8 +188,8 @@ class _GraphData(BaseModel):
|
|
|
186
188
|
nodes = list(nx_graph.nodes())
|
|
187
189
|
edges = [
|
|
188
190
|
_Edge(
|
|
189
|
-
|
|
190
|
-
|
|
191
|
+
source=u,
|
|
192
|
+
destination=v,
|
|
191
193
|
relationship=Relationship(data_dict["relationship"]),
|
|
192
194
|
)
|
|
193
195
|
for u, v, data_dict in nx_graph.edges(data=True)
|
|
@@ -198,19 +200,30 @@ class _GraphData(BaseModel):
|
|
|
198
200
|
class _Node(BaseModel):
|
|
199
201
|
model_config = ConfigDict()
|
|
200
202
|
|
|
201
|
-
|
|
202
|
-
labels: list[Label] = Field(default_factory=list)
|
|
203
|
+
labels: list[Label]
|
|
203
204
|
properties: dict = Field(default_factory=dict)
|
|
205
|
+
id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
|
|
204
206
|
|
|
205
207
|
def __hash__(self):
|
|
206
208
|
return hash(self.id_)
|
|
207
209
|
|
|
210
|
+
@property
|
|
211
|
+
def main_label(self) -> Label:
|
|
212
|
+
return self.labels[0]
|
|
213
|
+
|
|
214
|
+
@classmethod
|
|
215
|
+
@field_validator("labels", mode="after")
|
|
216
|
+
def require_at_least_one_label(cls, value: list[Label]) -> list[Label]:
|
|
217
|
+
if not value:
|
|
218
|
+
raise ValueError("Node must have at least one label.")
|
|
219
|
+
return value
|
|
220
|
+
|
|
208
221
|
|
|
209
222
|
class _Edge(BaseModel):
|
|
210
223
|
model_config = ConfigDict()
|
|
211
224
|
|
|
212
|
-
|
|
213
|
-
|
|
225
|
+
source: _Node
|
|
226
|
+
destination: _Node
|
|
214
227
|
relationship: Relationship
|
|
215
228
|
|
|
216
229
|
|
|
@@ -229,7 +242,14 @@ class Relationship(Enum):
|
|
|
229
242
|
|
|
230
243
|
class Neo4jUploaderConfig(UploaderConfig):
|
|
231
244
|
batch_size: int = Field(
|
|
232
|
-
default=
|
|
245
|
+
default=1000, description="Maximal number of nodes/relationships created per transaction."
|
|
246
|
+
)
|
|
247
|
+
similarity_function: SimilarityFunction = Field(
|
|
248
|
+
default="cosine",
|
|
249
|
+
description="Vector similarity function used to create index on Chunk nodes",
|
|
250
|
+
)
|
|
251
|
+
create_destination: bool = Field(
|
|
252
|
+
default=True, description="Create destination if it does not exist"
|
|
233
253
|
)
|
|
234
254
|
|
|
235
255
|
|
|
@@ -257,6 +277,13 @@ class Neo4jUploader(Uploader):
|
|
|
257
277
|
graph_data = _GraphData.model_validate(staged_data)
|
|
258
278
|
async with self.connection_config.get_client() as client:
|
|
259
279
|
await self._create_uniqueness_constraints(client)
|
|
280
|
+
embedding_dimensions = self._get_embedding_dimensions(graph_data)
|
|
281
|
+
if embedding_dimensions and self.upload_config.create_destination:
|
|
282
|
+
await self._create_vector_index(
|
|
283
|
+
client,
|
|
284
|
+
dimensions=embedding_dimensions,
|
|
285
|
+
similarity_function=self.upload_config.similarity_function,
|
|
286
|
+
)
|
|
260
287
|
await self._delete_old_data_if_exists(file_data, client=client)
|
|
261
288
|
await self._merge_graph(graph_data=graph_data, client=client)
|
|
262
289
|
|
|
@@ -274,13 +301,33 @@ class Neo4jUploader(Uploader):
|
|
|
274
301
|
"""
|
|
275
302
|
)
|
|
276
303
|
|
|
304
|
+
async def _create_vector_index(
|
|
305
|
+
self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
|
|
306
|
+
) -> None:
|
|
307
|
+
label = Label.CHUNK
|
|
308
|
+
logger.info(
|
|
309
|
+
f"Creating index on nodes labeled '{label.value}' if it does not already exist."
|
|
310
|
+
)
|
|
311
|
+
index_name = f"{label.value.lower()}_vector"
|
|
312
|
+
await client.execute_query(
|
|
313
|
+
f"""
|
|
314
|
+
CREATE VECTOR INDEX {index_name} IF NOT EXISTS
|
|
315
|
+
FOR (n:{label.value}) ON n.embedding
|
|
316
|
+
OPTIONS {{indexConfig: {{
|
|
317
|
+
`vector.similarity_function`: '{similarity_function}',
|
|
318
|
+
`vector.dimensions`: {dimensions}}}
|
|
319
|
+
}}
|
|
320
|
+
"""
|
|
321
|
+
)
|
|
322
|
+
|
|
277
323
|
async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
|
|
278
324
|
logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
|
|
279
325
|
_, summary, _ = await client.execute_query(
|
|
280
326
|
f"""
|
|
281
|
-
MATCH (n: {Label.DOCUMENT.value} {{id: $identifier}})
|
|
282
|
-
MATCH (n)--(m: {Label.CHUNK.value}
|
|
283
|
-
DETACH DELETE m
|
|
327
|
+
MATCH (n: `{Label.DOCUMENT.value}` {{id: $identifier}})
|
|
328
|
+
MATCH (n)--(m: `{Label.CHUNK.value}`|`{Label.UNSTRUCTURED_ELEMENT.value}`)
|
|
329
|
+
DETACH DELETE m
|
|
330
|
+
DETACH DELETE n""",
|
|
284
331
|
identifier=file_data.identifier,
|
|
285
332
|
)
|
|
286
333
|
logger.info(
|
|
@@ -289,16 +336,15 @@ class Neo4jUploader(Uploader):
|
|
|
289
336
|
)
|
|
290
337
|
|
|
291
338
|
async def _merge_graph(self, graph_data: _GraphData, client: AsyncDriver) -> None:
|
|
292
|
-
nodes_by_labels: defaultdict[
|
|
339
|
+
nodes_by_labels: defaultdict[Label, list[_Node]] = defaultdict(list)
|
|
293
340
|
for node in graph_data.nodes:
|
|
294
|
-
nodes_by_labels[
|
|
295
|
-
|
|
341
|
+
nodes_by_labels[node.main_label].append(node)
|
|
296
342
|
logger.info(f"Merging {len(graph_data.nodes)} graph nodes.")
|
|
297
343
|
# NOTE: Processed in parallel as there's no overlap between accessed nodes
|
|
298
344
|
await self._execute_queries(
|
|
299
345
|
[
|
|
300
|
-
self._create_nodes_query(nodes_batch,
|
|
301
|
-
for
|
|
346
|
+
self._create_nodes_query(nodes_batch, label)
|
|
347
|
+
for label, nodes in nodes_by_labels.items()
|
|
302
348
|
for nodes_batch in batch_generator(nodes, batch_size=self.upload_config.batch_size)
|
|
303
349
|
],
|
|
304
350
|
client=client,
|
|
@@ -306,16 +352,23 @@ class Neo4jUploader(Uploader):
|
|
|
306
352
|
)
|
|
307
353
|
logger.info(f"Finished merging {len(graph_data.nodes)} graph nodes.")
|
|
308
354
|
|
|
309
|
-
edges_by_relationship: defaultdict[Relationship, list[_Edge]] =
|
|
355
|
+
edges_by_relationship: defaultdict[tuple[Relationship, Label, Label], list[_Edge]] = (
|
|
356
|
+
defaultdict(list)
|
|
357
|
+
)
|
|
310
358
|
for edge in graph_data.edges:
|
|
311
|
-
|
|
359
|
+
key = (edge.relationship, edge.source.main_label, edge.destination.main_label)
|
|
360
|
+
edges_by_relationship[key].append(edge)
|
|
312
361
|
|
|
313
362
|
logger.info(f"Merging {len(graph_data.edges)} graph relationships (edges).")
|
|
314
363
|
# NOTE: Processed sequentially to avoid queries locking node access to one another
|
|
315
364
|
await self._execute_queries(
|
|
316
365
|
[
|
|
317
|
-
self._create_edges_query(edges_batch, relationship)
|
|
318
|
-
for
|
|
366
|
+
self._create_edges_query(edges_batch, relationship, source_label, destination_label)
|
|
367
|
+
for (
|
|
368
|
+
relationship,
|
|
369
|
+
source_label,
|
|
370
|
+
destination_label,
|
|
371
|
+
), edges in edges_by_relationship.items()
|
|
319
372
|
for edges_batch in batch_generator(edges, batch_size=self.upload_config.batch_size)
|
|
320
373
|
],
|
|
321
374
|
client=client,
|
|
@@ -328,53 +381,86 @@ class Neo4jUploader(Uploader):
|
|
|
328
381
|
client: AsyncDriver,
|
|
329
382
|
in_parallel: bool = False,
|
|
330
383
|
) -> None:
|
|
384
|
+
from neo4j import EagerResult
|
|
385
|
+
|
|
386
|
+
results: list[EagerResult] = []
|
|
387
|
+
logger.info(
|
|
388
|
+
f"Executing {len(queries_with_parameters)} "
|
|
389
|
+
+ f"{'parallel' if in_parallel else 'sequential'} Cypher statements."
|
|
390
|
+
)
|
|
331
391
|
if in_parallel:
|
|
332
|
-
|
|
333
|
-
await asyncio.gather(
|
|
392
|
+
results = await asyncio.gather(
|
|
334
393
|
*[
|
|
335
394
|
client.execute_query(query, parameters_=parameters)
|
|
336
395
|
for query, parameters in queries_with_parameters
|
|
337
396
|
]
|
|
338
397
|
)
|
|
339
|
-
logger.info("Finished executing parallel queries.")
|
|
340
398
|
else:
|
|
341
|
-
logger.info(f"Executing {len(queries_with_parameters)} queries sequentially.")
|
|
342
399
|
for i, (query, parameters) in enumerate(queries_with_parameters):
|
|
343
|
-
logger.info(f"
|
|
344
|
-
await client.execute_query(query, parameters_=parameters)
|
|
345
|
-
logger.info(f"
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
400
|
+
logger.info(f"Statement #{i} started.")
|
|
401
|
+
results.append(await client.execute_query(query, parameters_=parameters))
|
|
402
|
+
logger.info(f"Statement #{i} finished.")
|
|
403
|
+
nodeCount = sum([res.summary.counters.nodes_created for res in results])
|
|
404
|
+
relCount = sum([res.summary.counters.relationships_created for res in results])
|
|
405
|
+
logger.info(
|
|
406
|
+
f"Finished executing all ({len(queries_with_parameters)}) "
|
|
407
|
+
+ f"{'parallel' if in_parallel else 'sequential'} Cypher statements. "
|
|
408
|
+
+ f"Created {nodeCount} nodes, {relCount} relationships."
|
|
409
|
+
)
|
|
349
410
|
|
|
350
411
|
@staticmethod
|
|
351
|
-
def _create_nodes_query(nodes: list[_Node],
|
|
352
|
-
|
|
353
|
-
logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
|
|
412
|
+
def _create_nodes_query(nodes: list[_Node], label: Label) -> tuple[str, dict]:
|
|
413
|
+
logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{label}'.")
|
|
354
414
|
query_string = f"""
|
|
355
415
|
UNWIND $nodes AS node
|
|
356
|
-
MERGE (n: {
|
|
416
|
+
MERGE (n: `{label.value}` {{id: node.id}})
|
|
357
417
|
SET n += node.properties
|
|
418
|
+
SET n:$(node.labels)
|
|
419
|
+
WITH * WHERE node.vector IS NOT NULL
|
|
420
|
+
CALL db.create.setNodeVectorProperty(n, 'embedding', node.vector)
|
|
358
421
|
"""
|
|
359
|
-
parameters = {
|
|
422
|
+
parameters = {
|
|
423
|
+
"nodes": [
|
|
424
|
+
{
|
|
425
|
+
"id": node.id_,
|
|
426
|
+
"labels": [l.value for l in node.labels if l != label], # noqa: E741
|
|
427
|
+
"vector": node.properties.pop("embedding", None),
|
|
428
|
+
"properties": node.properties,
|
|
429
|
+
}
|
|
430
|
+
for node in nodes
|
|
431
|
+
]
|
|
432
|
+
}
|
|
360
433
|
return query_string, parameters
|
|
361
434
|
|
|
362
435
|
@staticmethod
|
|
363
|
-
def _create_edges_query(
|
|
436
|
+
def _create_edges_query(
|
|
437
|
+
edges: list[_Edge],
|
|
438
|
+
relationship: Relationship,
|
|
439
|
+
source_label: Label,
|
|
440
|
+
destination_label: Label,
|
|
441
|
+
) -> tuple[str, dict]:
|
|
364
442
|
logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")
|
|
365
443
|
query_string = f"""
|
|
366
444
|
UNWIND $edges AS edge
|
|
367
|
-
MATCH (u {{id: edge.source}})
|
|
368
|
-
MATCH (v {{id: edge.destination}})
|
|
369
|
-
MERGE (u)-[
|
|
445
|
+
MATCH (u: `{source_label.value}` {{id: edge.source}})
|
|
446
|
+
MATCH (v: `{destination_label.value}` {{id: edge.destination}})
|
|
447
|
+
MERGE (u)-[:`{relationship.value}`]->(v)
|
|
370
448
|
"""
|
|
371
449
|
parameters = {
|
|
372
450
|
"edges": [
|
|
373
|
-
{"source": edge.
|
|
451
|
+
{"source": edge.source.id_, "destination": edge.destination.id_} for edge in edges
|
|
374
452
|
]
|
|
375
453
|
}
|
|
376
454
|
return query_string, parameters
|
|
377
455
|
|
|
456
|
+
def _get_embedding_dimensions(self, graph_data: _GraphData) -> int | None:
|
|
457
|
+
"""Embedding dimensions inferred from chunk nodes or None if it can't be determined."""
|
|
458
|
+
for node in graph_data.nodes:
|
|
459
|
+
if Label.CHUNK in node.labels and "embeddings" in node.properties:
|
|
460
|
+
return len(node.properties["embeddings"])
|
|
461
|
+
|
|
462
|
+
return None
|
|
463
|
+
|
|
378
464
|
|
|
379
465
|
neo4j_destination_entry = DestinationRegistryEntry(
|
|
380
466
|
connection_config=Neo4jConnectionConfig,
|
|
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
|
|
|
31
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
32
32
|
|
|
33
33
|
CONNECTOR_TYPE = "sharepoint"
|
|
34
|
+
LEGACY_DEFAULT_PATH = "Shared Documents"
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
class SharepointAccessConfig(OnedriveAccessConfig):
|
|
@@ -76,10 +77,14 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
76
77
|
except ClientRequestException:
|
|
77
78
|
logger.info("Site not found")
|
|
78
79
|
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
80
|
+
path = self.index_config.path
|
|
81
|
+
# Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
|
|
82
|
+
if path and path != LEGACY_DEFAULT_PATH:
|
|
83
|
+
site_drive_item = site_drive_item.get_by_path(path).get().execute_query()
|
|
84
|
+
|
|
85
|
+
for drive_item in site_drive_item.get_files(
|
|
86
|
+
recursive=self.index_config.recursive
|
|
87
|
+
).execute_query():
|
|
83
88
|
file_data = await self.drive_item_to_file_data(drive_item=drive_item)
|
|
84
89
|
yield file_data
|
|
85
90
|
|
|
@@ -92,18 +92,20 @@ class EmbedderConfig(BaseModel):
|
|
|
92
92
|
|
|
93
93
|
return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
|
|
94
94
|
|
|
95
|
-
def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
|
|
95
|
+
def get_bedrock_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
96
96
|
from unstructured_ingest.embed.bedrock import (
|
|
97
97
|
BedrockEmbeddingConfig,
|
|
98
98
|
BedrockEmbeddingEncoder,
|
|
99
99
|
)
|
|
100
100
|
|
|
101
|
+
embedding_kwargs = embedding_kwargs | {
|
|
102
|
+
"aws_access_key_id": self.embedding_aws_access_key_id,
|
|
103
|
+
"aws_secret_access_key": self.embedding_aws_secret_access_key.get_secret_value(),
|
|
104
|
+
"region_name": self.embedding_aws_region,
|
|
105
|
+
}
|
|
106
|
+
|
|
101
107
|
return BedrockEmbeddingEncoder(
|
|
102
|
-
config=BedrockEmbeddingConfig(
|
|
103
|
-
aws_access_key_id=self.embedding_aws_access_key_id,
|
|
104
|
-
aws_secret_access_key=self.embedding_aws_secret_access_key.get_secret_value(),
|
|
105
|
-
region_name=self.embedding_aws_region,
|
|
106
|
-
)
|
|
108
|
+
config=BedrockEmbeddingConfig.model_validate(embedding_kwargs)
|
|
107
109
|
)
|
|
108
110
|
|
|
109
111
|
def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
|
|
@@ -163,7 +165,7 @@ class EmbedderConfig(BaseModel):
|
|
|
163
165
|
return self.get_octoai_embedder(embedding_kwargs=kwargs)
|
|
164
166
|
|
|
165
167
|
if self.embedding_provider == "bedrock":
|
|
166
|
-
return self.get_bedrock_embedder()
|
|
168
|
+
return self.get_bedrock_embedder(embedding_kwargs=kwargs)
|
|
167
169
|
|
|
168
170
|
if self.embedding_provider == "vertexai":
|
|
169
171
|
return self.get_vertexai_embedder(embedding_kwargs=kwargs)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,13 +22,45 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
+
Requires-Dist: opentelemetry-sdk
|
|
25
26
|
Requires-Dist: pandas
|
|
26
|
-
Requires-Dist: pydantic>=2.7
|
|
27
|
-
Requires-Dist: dataclasses-json
|
|
28
27
|
Requires-Dist: python-dateutil
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Requires-Dist: click
|
|
28
|
+
Requires-Dist: dataclasses_json
|
|
31
29
|
Requires-Dist: tqdm
|
|
30
|
+
Requires-Dist: pydantic>=2.7
|
|
31
|
+
Requires-Dist: click
|
|
32
|
+
Provides-Extra: remote
|
|
33
|
+
Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
|
|
34
|
+
Provides-Extra: csv
|
|
35
|
+
Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
36
|
+
Provides-Extra: doc
|
|
37
|
+
Requires-Dist: unstructured[docx]; extra == "doc"
|
|
38
|
+
Provides-Extra: docx
|
|
39
|
+
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
40
|
+
Provides-Extra: epub
|
|
41
|
+
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
42
|
+
Provides-Extra: md
|
|
43
|
+
Requires-Dist: unstructured[md]; extra == "md"
|
|
44
|
+
Provides-Extra: msg
|
|
45
|
+
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
46
|
+
Provides-Extra: odt
|
|
47
|
+
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
48
|
+
Provides-Extra: org
|
|
49
|
+
Requires-Dist: unstructured[org]; extra == "org"
|
|
50
|
+
Provides-Extra: pdf
|
|
51
|
+
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
52
|
+
Provides-Extra: ppt
|
|
53
|
+
Requires-Dist: unstructured[pptx]; extra == "ppt"
|
|
54
|
+
Provides-Extra: pptx
|
|
55
|
+
Requires-Dist: unstructured[pptx]; extra == "pptx"
|
|
56
|
+
Provides-Extra: rtf
|
|
57
|
+
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
58
|
+
Provides-Extra: rst
|
|
59
|
+
Requires-Dist: unstructured[rst]; extra == "rst"
|
|
60
|
+
Provides-Extra: tsv
|
|
61
|
+
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
62
|
+
Provides-Extra: xlsx
|
|
63
|
+
Requires-Dist: unstructured[xlsx]; extra == "xlsx"
|
|
32
64
|
Provides-Extra: airtable
|
|
33
65
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
34
66
|
Provides-Extra: astradb
|
|
@@ -38,63 +70,37 @@ Requires-Dist: fsspec; extra == "azure"
|
|
|
38
70
|
Requires-Dist: adlfs; extra == "azure"
|
|
39
71
|
Provides-Extra: azure-ai-search
|
|
40
72
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
41
|
-
Provides-Extra: bedrock
|
|
42
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
43
|
-
Requires-Dist: aioboto3; extra == "bedrock"
|
|
44
73
|
Provides-Extra: biomed
|
|
45
74
|
Requires-Dist: requests; extra == "biomed"
|
|
46
75
|
Requires-Dist: bs4; extra == "biomed"
|
|
47
76
|
Provides-Extra: box
|
|
48
|
-
Requires-Dist: fsspec; extra == "box"
|
|
49
77
|
Requires-Dist: boxfs; extra == "box"
|
|
78
|
+
Requires-Dist: fsspec; extra == "box"
|
|
50
79
|
Provides-Extra: chroma
|
|
51
80
|
Requires-Dist: chromadb; extra == "chroma"
|
|
52
81
|
Provides-Extra: clarifai
|
|
53
82
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
54
83
|
Provides-Extra: confluence
|
|
55
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
56
84
|
Requires-Dist: requests; extra == "confluence"
|
|
85
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
57
86
|
Provides-Extra: couchbase
|
|
58
87
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
59
|
-
Provides-Extra: csv
|
|
60
|
-
Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
61
|
-
Provides-Extra: databricks-delta-tables
|
|
62
|
-
Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
63
|
-
Provides-Extra: databricks-volumes
|
|
64
|
-
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
65
88
|
Provides-Extra: delta-table
|
|
66
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
67
89
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
90
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
68
91
|
Provides-Extra: discord
|
|
69
92
|
Requires-Dist: discord.py; extra == "discord"
|
|
70
|
-
Provides-Extra: doc
|
|
71
|
-
Requires-Dist: unstructured[docx]; extra == "doc"
|
|
72
|
-
Provides-Extra: docx
|
|
73
|
-
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
74
93
|
Provides-Extra: dropbox
|
|
75
|
-
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
76
94
|
Requires-Dist: fsspec; extra == "dropbox"
|
|
95
|
+
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
77
96
|
Provides-Extra: duckdb
|
|
78
97
|
Requires-Dist: duckdb; extra == "duckdb"
|
|
79
98
|
Provides-Extra: elasticsearch
|
|
80
99
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
81
|
-
Provides-Extra: embed-huggingface
|
|
82
|
-
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
83
|
-
Provides-Extra: embed-mixedbreadai
|
|
84
|
-
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
85
|
-
Provides-Extra: embed-octoai
|
|
86
|
-
Requires-Dist: openai; extra == "embed-octoai"
|
|
87
|
-
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
88
|
-
Provides-Extra: embed-vertexai
|
|
89
|
-
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
90
|
-
Provides-Extra: embed-voyageai
|
|
91
|
-
Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
92
|
-
Provides-Extra: epub
|
|
93
|
-
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
94
100
|
Provides-Extra: gcs
|
|
101
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
95
102
|
Requires-Dist: fsspec; extra == "gcs"
|
|
96
103
|
Requires-Dist: bs4; extra == "gcs"
|
|
97
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
98
104
|
Provides-Extra: github
|
|
99
105
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
100
106
|
Requires-Dist: requests; extra == "github"
|
|
@@ -113,97 +119,103 @@ Provides-Extra: kdbai
|
|
|
113
119
|
Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
|
|
114
120
|
Provides-Extra: lancedb
|
|
115
121
|
Requires-Dist: lancedb; extra == "lancedb"
|
|
116
|
-
Provides-Extra: md
|
|
117
|
-
Requires-Dist: unstructured[md]; extra == "md"
|
|
118
122
|
Provides-Extra: milvus
|
|
119
123
|
Requires-Dist: pymilvus; extra == "milvus"
|
|
120
124
|
Provides-Extra: mongodb
|
|
121
125
|
Requires-Dist: pymongo; extra == "mongodb"
|
|
122
|
-
Provides-Extra: msg
|
|
123
|
-
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
124
126
|
Provides-Extra: neo4j
|
|
125
|
-
Requires-Dist: networkx; extra == "neo4j"
|
|
126
127
|
Requires-Dist: cymple; extra == "neo4j"
|
|
127
|
-
Requires-Dist:
|
|
128
|
+
Requires-Dist: networkx; extra == "neo4j"
|
|
129
|
+
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
128
130
|
Provides-Extra: notion
|
|
129
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
130
|
-
Requires-Dist: backoff; extra == "notion"
|
|
131
|
-
Requires-Dist: notion-client; extra == "notion"
|
|
132
131
|
Requires-Dist: httpx; extra == "notion"
|
|
133
|
-
|
|
134
|
-
Requires-Dist:
|
|
132
|
+
Requires-Dist: notion-client; extra == "notion"
|
|
133
|
+
Requires-Dist: backoff; extra == "notion"
|
|
134
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
135
135
|
Provides-Extra: onedrive
|
|
136
136
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
137
|
-
Requires-Dist: bs4; extra == "onedrive"
|
|
138
137
|
Requires-Dist: msal; extra == "onedrive"
|
|
139
|
-
|
|
140
|
-
Requires-Dist: openai; extra == "openai"
|
|
141
|
-
Requires-Dist: tiktoken; extra == "openai"
|
|
138
|
+
Requires-Dist: bs4; extra == "onedrive"
|
|
142
139
|
Provides-Extra: opensearch
|
|
143
140
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
144
|
-
Provides-Extra: org
|
|
145
|
-
Requires-Dist: unstructured[org]; extra == "org"
|
|
146
141
|
Provides-Extra: outlook
|
|
147
142
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
148
143
|
Requires-Dist: msal; extra == "outlook"
|
|
149
|
-
Provides-Extra: pdf
|
|
150
|
-
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
151
144
|
Provides-Extra: pinecone
|
|
152
145
|
Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
|
|
153
146
|
Provides-Extra: postgres
|
|
154
147
|
Requires-Dist: psycopg2-binary; extra == "postgres"
|
|
155
|
-
Provides-Extra: ppt
|
|
156
|
-
Requires-Dist: unstructured[pptx]; extra == "ppt"
|
|
157
|
-
Provides-Extra: pptx
|
|
158
|
-
Requires-Dist: unstructured[pptx]; extra == "pptx"
|
|
159
148
|
Provides-Extra: qdrant
|
|
160
149
|
Requires-Dist: qdrant-client; extra == "qdrant"
|
|
161
150
|
Provides-Extra: reddit
|
|
162
151
|
Requires-Dist: praw; extra == "reddit"
|
|
163
152
|
Provides-Extra: redis
|
|
164
153
|
Requires-Dist: redis; extra == "redis"
|
|
165
|
-
Provides-Extra: remote
|
|
166
|
-
Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
|
|
167
|
-
Provides-Extra: rst
|
|
168
|
-
Requires-Dist: unstructured[rst]; extra == "rst"
|
|
169
|
-
Provides-Extra: rtf
|
|
170
|
-
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
171
154
|
Provides-Extra: s3
|
|
172
|
-
Requires-Dist: s3fs; extra == "s3"
|
|
173
155
|
Requires-Dist: fsspec; extra == "s3"
|
|
156
|
+
Requires-Dist: s3fs; extra == "s3"
|
|
157
|
+
Provides-Extra: sharepoint
|
|
158
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
159
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
174
160
|
Provides-Extra: salesforce
|
|
175
161
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
176
162
|
Provides-Extra: sftp
|
|
177
|
-
Requires-Dist: fsspec; extra == "sftp"
|
|
178
163
|
Requires-Dist: paramiko; extra == "sftp"
|
|
179
|
-
|
|
180
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
181
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
182
|
-
Provides-Extra: singlestore
|
|
183
|
-
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
164
|
+
Requires-Dist: fsspec; extra == "sftp"
|
|
184
165
|
Provides-Extra: slack
|
|
185
|
-
Requires-Dist:
|
|
166
|
+
Requires-Dist: slack_sdk[optional]; extra == "slack"
|
|
186
167
|
Provides-Extra: snowflake
|
|
187
|
-
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
188
168
|
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
Requires-Dist:
|
|
196
|
-
|
|
169
|
+
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
170
|
+
Provides-Extra: wikipedia
|
|
171
|
+
Requires-Dist: wikipedia; extra == "wikipedia"
|
|
172
|
+
Provides-Extra: weaviate
|
|
173
|
+
Requires-Dist: weaviate-client; extra == "weaviate"
|
|
174
|
+
Provides-Extra: databricks-volumes
|
|
175
|
+
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
176
|
+
Provides-Extra: databricks-delta-tables
|
|
177
|
+
Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
178
|
+
Provides-Extra: singlestore
|
|
179
|
+
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
197
180
|
Provides-Extra: vectara
|
|
198
181
|
Requires-Dist: requests; extra == "vectara"
|
|
199
182
|
Requires-Dist: httpx; extra == "vectara"
|
|
200
183
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
201
|
-
Provides-Extra:
|
|
202
|
-
Requires-Dist:
|
|
203
|
-
|
|
204
|
-
Requires-Dist:
|
|
205
|
-
Provides-Extra:
|
|
206
|
-
Requires-Dist:
|
|
184
|
+
Provides-Extra: vastdb
|
|
185
|
+
Requires-Dist: pyarrow; extra == "vastdb"
|
|
186
|
+
Requires-Dist: ibis; extra == "vastdb"
|
|
187
|
+
Requires-Dist: vastdb; extra == "vastdb"
|
|
188
|
+
Provides-Extra: embed-huggingface
|
|
189
|
+
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
190
|
+
Provides-Extra: embed-octoai
|
|
191
|
+
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
192
|
+
Requires-Dist: openai; extra == "embed-octoai"
|
|
193
|
+
Provides-Extra: embed-vertexai
|
|
194
|
+
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
195
|
+
Provides-Extra: embed-voyageai
|
|
196
|
+
Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
197
|
+
Provides-Extra: embed-mixedbreadai
|
|
198
|
+
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
199
|
+
Provides-Extra: openai
|
|
200
|
+
Requires-Dist: tiktoken; extra == "openai"
|
|
201
|
+
Requires-Dist: openai; extra == "openai"
|
|
202
|
+
Provides-Extra: bedrock
|
|
203
|
+
Requires-Dist: aioboto3; extra == "bedrock"
|
|
204
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
205
|
+
Provides-Extra: togetherai
|
|
206
|
+
Requires-Dist: together; extra == "togetherai"
|
|
207
|
+
Dynamic: author
|
|
208
|
+
Dynamic: author-email
|
|
209
|
+
Dynamic: classifier
|
|
210
|
+
Dynamic: description
|
|
211
|
+
Dynamic: description-content-type
|
|
212
|
+
Dynamic: home-page
|
|
213
|
+
Dynamic: keywords
|
|
214
|
+
Dynamic: license
|
|
215
|
+
Dynamic: provides-extra
|
|
216
|
+
Dynamic: requires-dist
|
|
217
|
+
Dynamic: requires-python
|
|
218
|
+
Dynamic: summary
|
|
207
219
|
|
|
208
220
|
# Unstructured Ingest
|
|
209
221
|
|