unstructured-ingest 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.


@@ -8,9 +8,9 @@ from contextlib import asynccontextmanager
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Literal, Optional
 
-from pydantic import BaseModel, ConfigDict, Field, Secret
+from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator
 
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.logger import logger
@@ -30,6 +30,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
 )
 
+SimilarityFunction = Literal["cosine"]
+
 if TYPE_CHECKING:
     from neo4j import AsyncDriver, Auth
     from networkx import Graph, MultiDiGraph
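Note: `SimilarityFunction` is a `Literal` with a single member, so any config field declared with this type accepts only "cosine" and fails validation otherwise. A minimal standalone sketch of that behavior (the model name here is a hypothetical stand-in, not the connector's class):

    from typing import Literal

    from pydantic import BaseModel, ValidationError

    SimilarityFunction = Literal["cosine"]

    class IndexOptions(BaseModel):  # hypothetical stand-in for the uploader config
        similarity_function: SimilarityFunction = "cosine"

    IndexOptions(similarity_function="cosine")  # accepted
    try:
        IndexOptions(similarity_function="euclidean")
    except ValidationError as exc:
        print(exc)  # Input should be 'cosine'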
@@ -44,9 +46,9 @@ class Neo4jAccessConfig(AccessConfig):
 class Neo4jConnectionConfig(ConnectionConfig):
     access_config: Secret[Neo4jAccessConfig]
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
-    username: str
+    username: str = Field(default="neo4j")
     uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
-    database: str = Field(description="Name of the target database")
+    database: str = Field(default="neo4j", description="Name of the target database")
 
     @requires_dependencies(["neo4j"], extras="neo4j")
     @asynccontextmanager
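With `username` and `database` now defaulting to "neo4j", a connection config only strictly requires the URI and credentials. A hedged sketch of the new defaults using a stand-in model (the real class also carries `access_config` and `connector_type`, and the fields of `Neo4jAccessConfig` are outside this hunk):

    from pydantic import BaseModel, Field

    class ConnectionSketch(BaseModel):  # stand-in for Neo4jConnectionConfig
        uri: str = Field(description="Neo4j Connection URI <scheme>://<host>:<port>")
        username: str = Field(default="neo4j")
        database: str = Field(default="neo4j", description="Name of the target database")

    cfg = ConnectionSketch(uri="neo4j://localhost:7687")
    assert (cfg.username, cfg.database) == ("neo4j", "neo4j")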
@@ -186,8 +188,8 @@ class _GraphData(BaseModel):
         nodes = list(nx_graph.nodes())
         edges = [
             _Edge(
-                source_id=u.id_,
-                destination_id=v.id_,
+                source=u,
+                destination=v,
                 relationship=Relationship(data_dict["relationship"]),
             )
             for u, v, data_dict in nx_graph.edges(data=True)
@@ -198,19 +200,30 @@ class _GraphData(BaseModel):
 class _Node(BaseModel):
     model_config = ConfigDict()
 
-    id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
-    labels: list[Label] = Field(default_factory=list)
+    labels: list[Label]
     properties: dict = Field(default_factory=dict)
+    id_: str = Field(default_factory=lambda: str(uuid.uuid4()))
 
     def __hash__(self):
         return hash(self.id_)
 
+    @property
+    def main_label(self) -> Label:
+        return self.labels[0]
+
+    @classmethod
+    @field_validator("labels", mode="after")
+    def require_at_least_one_label(cls, value: list[Label]) -> list[Label]:
+        if not value:
+            raise ValueError("Node must have at least one label.")
+        return value
+
 
 class _Edge(BaseModel):
     model_config = ConfigDict()
 
-    source_id: str
-    destination_id: str
+    source: _Node
+    destination: _Node
     relationship: Relationship
 
 
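`labels` is now a required, validated field, and `main_label` exposes its first entry. The sketch below reproduces that contract; note it stacks `@field_validator` above `@classmethod`, the ordering pydantic's documentation shows, whereas this hunk stacks the two decorators the other way around:

    import uuid
    from enum import Enum

    from pydantic import BaseModel, Field, ValidationError, field_validator

    class Label(Enum):  # stand-in; the connector's enum has more members
        CHUNK = "Chunk"
        DOCUMENT = "Document"

    class NodeSketch(BaseModel):
        labels: list[Label]
        properties: dict = Field(default_factory=dict)
        id_: str = Field(default_factory=lambda: str(uuid.uuid4()))

        @field_validator("labels", mode="after")
        @classmethod
        def require_at_least_one_label(cls, value: list[Label]) -> list[Label]:
            if not value:
                raise ValueError("Node must have at least one label.")
            return value

        @property
        def main_label(self) -> Label:
            return self.labels[0]

    try:
        NodeSketch(labels=[])
    except ValidationError as exc:
        print(exc)  # Node must have at least one label.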
@@ -229,7 +242,14 @@ class Relationship(Enum):
 
 class Neo4jUploaderConfig(UploaderConfig):
     batch_size: int = Field(
-        default=100, description="Maximal number of nodes/relationships created per transaction."
+        default=1000, description="Maximal number of nodes/relationships created per transaction."
+    )
+    similarity_function: SimilarityFunction = Field(
+        default="cosine",
+        description="Vector similarity function used to create index on Chunk nodes",
+    )
+    create_destination: bool = Field(
+        default=True, description="Create destination if it does not exist"
     )
 
 
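Beyond the larger default transaction size (1000, up from 100), the two new options let callers skip index creation entirely or, should more similarity functions ever be admitted, change the metric. A usage sketch with a stand-in model (the real config inherits further fields from UploaderConfig):

    from typing import Literal

    from pydantic import BaseModel, Field

    SimilarityFunction = Literal["cosine"]

    class UploaderOptionsSketch(BaseModel):  # stand-in for Neo4jUploaderConfig
        batch_size: int = Field(default=1000)
        similarity_function: SimilarityFunction = Field(default="cosine")
        create_destination: bool = Field(default=True)

    # Keep the old transaction size and skip index creation on a locked-down database:
    opts = UploaderOptionsSketch(batch_size=100, create_destination=False)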
@@ -257,6 +277,13 @@ class Neo4jUploader(Uploader):
         graph_data = _GraphData.model_validate(staged_data)
         async with self.connection_config.get_client() as client:
             await self._create_uniqueness_constraints(client)
+            embedding_dimensions = self._get_embedding_dimensions(graph_data)
+            if embedding_dimensions and self.upload_config.create_destination:
+                await self._create_vector_index(
+                    client,
+                    dimensions=embedding_dimensions,
+                    similarity_function=self.upload_config.similarity_function,
+                )
             await self._delete_old_data_if_exists(file_data, client=client)
             await self._merge_graph(graph_data=graph_data, client=client)
 
@@ -274,13 +301,33 @@ class Neo4jUploader(Uploader):
             """
         )
 
+    async def _create_vector_index(
+        self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
+    ) -> None:
+        label = Label.CHUNK
+        logger.info(
+            f"Creating index on nodes labeled '{label.value}' if it does not already exist."
+        )
+        index_name = f"{label.value.lower()}_vector"
+        await client.execute_query(
+            f"""
+            CREATE VECTOR INDEX {index_name} IF NOT EXISTS
+            FOR (n:{label.value}) ON n.embedding
+            OPTIONS {{indexConfig: {{
+                `vector.similarity_function`: '{similarity_function}',
+                `vector.dimensions`: {dimensions}}}
+            }}
+            """
+        )
+
     async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
         logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
         _, summary, _ = await client.execute_query(
             f"""
-            MATCH (n: {Label.DOCUMENT.value} {{id: $identifier}})
-            MATCH (n)--(m: {Label.CHUNK.value}|{Label.UNSTRUCTURED_ELEMENT.value})
-            DETACH DELETE m""",
+            MATCH (n: `{Label.DOCUMENT.value}` {{id: $identifier}})
+            MATCH (n)--(m: `{Label.CHUNK.value}`|`{Label.UNSTRUCTURED_ELEMENT.value}`)
+            DETACH DELETE m
+            DETACH DELETE n""",
             identifier=file_data.identifier,
         )
         logger.info(
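To make the index DDL concrete, the following renders the statement the uploader would issue for a hypothetical 1536-dimension embedding space; only the dimension count is invented here, the label and template come from the hunk above:

    label_value = "Chunk"  # Label.CHUNK.value in the connector
    dimensions = 1536      # hypothetical; inferred from staged chunk nodes
    similarity_function = "cosine"

    index_name = f"{label_value.lower()}_vector"  # -> "chunk_vector"
    statement = f"""
    CREATE VECTOR INDEX {index_name} IF NOT EXISTS
    FOR (n:{label_value}) ON n.embedding
    OPTIONS {{indexConfig: {{
        `vector.similarity_function`: '{similarity_function}',
        `vector.dimensions`: {dimensions}}}
    }}
    """
    print(statement)  # created only if absent, so re-runs are idempotent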
@@ -289,16 +336,15 @@ class Neo4jUploader(Uploader):
         )
 
     async def _merge_graph(self, graph_data: _GraphData, client: AsyncDriver) -> None:
-        nodes_by_labels: defaultdict[tuple[Label, ...], list[_Node]] = defaultdict(list)
+        nodes_by_labels: defaultdict[Label, list[_Node]] = defaultdict(list)
         for node in graph_data.nodes:
-            nodes_by_labels[tuple(node.labels)].append(node)
-
+            nodes_by_labels[node.main_label].append(node)
         logger.info(f"Merging {len(graph_data.nodes)} graph nodes.")
         # NOTE: Processed in parallel as there's no overlap between accessed nodes
         await self._execute_queries(
             [
-                self._create_nodes_query(nodes_batch, labels)
-                for labels, nodes in nodes_by_labels.items()
+                self._create_nodes_query(nodes_batch, label)
+                for label, nodes in nodes_by_labels.items()
                 for nodes_batch in batch_generator(nodes, batch_size=self.upload_config.batch_size)
             ],
             client=client,
@@ -306,16 +352,23 @@ class Neo4jUploader(Uploader):
         )
         logger.info(f"Finished merging {len(graph_data.nodes)} graph nodes.")
 
-        edges_by_relationship: defaultdict[Relationship, list[_Edge]] = defaultdict(list)
+        edges_by_relationship: defaultdict[tuple[Relationship, Label, Label], list[_Edge]] = (
+            defaultdict(list)
+        )
         for edge in graph_data.edges:
-            edges_by_relationship[edge.relationship].append(edge)
+            key = (edge.relationship, edge.source.main_label, edge.destination.main_label)
+            edges_by_relationship[key].append(edge)
 
         logger.info(f"Merging {len(graph_data.edges)} graph relationships (edges).")
         # NOTE: Processed sequentially to avoid queries locking node access to one another
         await self._execute_queries(
             [
-                self._create_edges_query(edges_batch, relationship)
-                for relationship, edges in edges_by_relationship.items()
+                self._create_edges_query(edges_batch, relationship, source_label, destination_label)
+                for (
+                    relationship,
+                    source_label,
+                    destination_label,
+                ), edges in edges_by_relationship.items()
                 for edges_batch in batch_generator(edges, batch_size=self.upload_config.batch_size)
             ],
             client=client,
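Because the MERGE queries can now MATCH on concrete endpoint labels, edges are bucketed by a (relationship, source label, destination label) triple instead of by relationship alone. A compact sketch of the bucketing with plain tuples (the relationship and label names are illustrative, not the connector's enums):

    from collections import defaultdict

    # (relationship, source_label, destination_label, source_id, destination_id)
    edges = [
        ("PART_OF_DOCUMENT", "Chunk", "Document", "c1", "d1"),
        ("PART_OF_DOCUMENT", "UnstructuredElement", "Document", "e1", "d1"),
    ]

    buckets: defaultdict[tuple[str, str, str], list[tuple[str, str]]] = defaultdict(list)
    for rel, src_label, dst_label, src, dst in edges:
        buckets[(rel, src_label, dst_label)].append((src, dst))

    # One MERGE statement per bucket, each free to pin both endpoint labels.
    for (rel, src_label, dst_label), pairs in buckets.items():
        print(f"{len(pairs)} edge(s): (:{src_label})-[:{rel}]->(:{dst_label})")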
@@ -328,53 +381,86 @@ class Neo4jUploader(Uploader):
         client: AsyncDriver,
         in_parallel: bool = False,
     ) -> None:
+        from neo4j import EagerResult
+
+        results: list[EagerResult] = []
+        logger.info(
+            f"Executing {len(queries_with_parameters)} "
+            + f"{'parallel' if in_parallel else 'sequential'} Cypher statements."
+        )
         if in_parallel:
-            logger.info(f"Executing {len(queries_with_parameters)} queries in parallel.")
-            await asyncio.gather(
+            results = await asyncio.gather(
                 *[
                     client.execute_query(query, parameters_=parameters)
                     for query, parameters in queries_with_parameters
                 ]
             )
-            logger.info("Finished executing parallel queries.")
         else:
-            logger.info(f"Executing {len(queries_with_parameters)} queries sequentially.")
             for i, (query, parameters) in enumerate(queries_with_parameters):
-                logger.info(f"Query #{i} started.")
-                await client.execute_query(query, parameters_=parameters)
-                logger.info(f"Query #{i} finished.")
-            logger.info(
-                f"Finished executing all ({len(queries_with_parameters)}) sequential queries."
-            )
+                logger.info(f"Statement #{i} started.")
+                results.append(await client.execute_query(query, parameters_=parameters))
+                logger.info(f"Statement #{i} finished.")
+        nodeCount = sum([res.summary.counters.nodes_created for res in results])
+        relCount = sum([res.summary.counters.relationships_created for res in results])
+        logger.info(
+            f"Finished executing all ({len(queries_with_parameters)}) "
+            + f"{'parallel' if in_parallel else 'sequential'} Cypher statements. "
+            + f"Created {nodeCount} nodes, {relCount} relationships."
+        )
 
     @staticmethod
-    def _create_nodes_query(nodes: list[_Node], labels: tuple[Label, ...]) -> tuple[str, dict]:
-        labels_string = ", ".join([label.value for label in labels])
-        logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{labels_string}'.")
+    def _create_nodes_query(nodes: list[_Node], label: Label) -> tuple[str, dict]:
+        logger.info(f"Preparing MERGE query for {len(nodes)} nodes labeled '{label}'.")
         query_string = f"""
             UNWIND $nodes AS node
-            MERGE (n: {labels_string} {{id: node.id}})
+            MERGE (n: `{label.value}` {{id: node.id}})
             SET n += node.properties
+            SET n:$(node.labels)
+            WITH * WHERE node.vector IS NOT NULL
+            CALL db.create.setNodeVectorProperty(n, 'embedding', node.vector)
         """
-        parameters = {"nodes": [{"id": node.id_, "properties": node.properties} for node in nodes]}
+        parameters = {
+            "nodes": [
+                {
+                    "id": node.id_,
+                    "labels": [l.value for l in node.labels if l != label],  # noqa: E741
+                    "vector": node.properties.pop("embedding", None),
+                    "properties": node.properties,
+                }
+                for node in nodes
+            ]
+        }
         return query_string, parameters
 
     @staticmethod
-    def _create_edges_query(edges: list[_Edge], relationship: Relationship) -> tuple[str, dict]:
+    def _create_edges_query(
+        edges: list[_Edge],
+        relationship: Relationship,
+        source_label: Label,
+        destination_label: Label,
+    ) -> tuple[str, dict]:
         logger.info(f"Preparing MERGE query for {len(edges)} {relationship} relationships.")
         query_string = f"""
             UNWIND $edges AS edge
-            MATCH (u {{id: edge.source}})
-            MATCH (v {{id: edge.destination}})
-            MERGE (u)-[:{relationship.value}]->(v)
+            MATCH (u: `{source_label.value}` {{id: edge.source}})
+            MATCH (v: `{destination_label.value}` {{id: edge.destination}})
+            MERGE (u)-[:`{relationship.value}`]->(v)
         """
         parameters = {
             "edges": [
-                {"source": edge.source_id, "destination": edge.destination_id} for edge in edges
+                {"source": edge.source.id_, "destination": edge.destination.id_} for edge in edges
             ]
         }
         return query_string, parameters
 
+    def _get_embedding_dimensions(self, graph_data: _GraphData) -> int | None:
+        """Embedding dimensions inferred from chunk nodes or None if it can't be determined."""
+        for node in graph_data.nodes:
+            if Label.CHUNK in node.labels and "embeddings" in node.properties:
+                return len(node.properties["embeddings"])
+
+        return None
+
 
 neo4j_destination_entry = DestinationRegistryEntry(
     connection_config=Neo4jConnectionConfig,
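One detail worth flagging: `_get_embedding_dimensions` probes for an "embeddings" (plural) key on chunk-node properties, while `_create_nodes_query` pops "embedding" (singular) into the vector parameter; whether both keys are present on staged nodes is not visible in this diff. A hedged sketch of the probe itself, with plain dicts standing in for `_Node`:

    from typing import Optional

    def infer_embedding_dimensions(nodes: list[dict]) -> Optional[int]:
        """Vector length of the first chunk node carrying embeddings, else None."""
        for node in nodes:
            if "Chunk" in node.get("labels", []) and "embeddings" in node.get("properties", {}):
                return len(node["properties"]["embeddings"])
        return None

    assert infer_embedding_dimensions(
        [{"labels": ["Chunk"], "properties": {"embeddings": [0.1] * 384}}]
    ) == 384
    assert infer_embedding_dimensions([{"labels": ["Document"], "properties": {}}]) is None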
@@ -31,6 +31,7 @@ if TYPE_CHECKING:
     from office365.onedrive.driveitems.driveItem import DriveItem
 
 CONNECTOR_TYPE = "sharepoint"
+LEGACY_DEFAULT_PATH = "Shared Documents"
 
 
 class SharepointAccessConfig(OnedriveAccessConfig):
@@ -76,10 +77,14 @@ class SharepointIndexer(OnedriveIndexer):
         except ClientRequestException:
             logger.info("Site not found")
 
-        drive_items = await self.list_objects(
-            folder=site_drive_item, recursive=self.index_config.recursive
-        )
-        for drive_item in drive_items:
+        path = self.index_config.path
+        # Deprecated sharepoint sdk needed a default path. Microsoft Graph SDK does not.
+        if path and path != LEGACY_DEFAULT_PATH:
+            site_drive_item = site_drive_item.get_by_path(path).get().execute_query()
+
+        for drive_item in site_drive_item.get_files(
+            recursive=self.index_config.recursive
+        ).execute_query():
             file_data = await self.drive_item_to_file_data(drive_item=drive_item)
             yield file_data
 
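The indexer now descends into a configured folder via the drive item itself and enumerates files directly, while `LEGACY_DEFAULT_PATH` keeps configs that passed "Shared Documents" (a default the deprecated SDK required) pointed at the drive root. A sketch of just the guard logic, with the SDK call stubbed out:

    from typing import Optional

    LEGACY_DEFAULT_PATH = "Shared Documents"

    def resolve_root(drive_root: str, path: Optional[str]) -> str:
        """Descend into `path` unless it is empty or the legacy default."""
        if path and path != LEGACY_DEFAULT_PATH:
            # In the connector: site_drive_item.get_by_path(path).get().execute_query()
            return f"{drive_root}/{path}"
        return drive_root

    assert resolve_root("root", None) == "root"
    assert resolve_root("root", "Shared Documents") == "root"
    assert resolve_root("root", "Docs/Reports") == "root/Docs/Reports"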
@@ -92,18 +92,20 @@ class EmbedderConfig(BaseModel):
 
         return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig.model_validate(embedding_kwargs))
 
-    def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
+    def get_bedrock_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
         from unstructured_ingest.embed.bedrock import (
             BedrockEmbeddingConfig,
             BedrockEmbeddingEncoder,
         )
 
+        embedding_kwargs = embedding_kwargs | {
+            "aws_access_key_id": self.embedding_aws_access_key_id,
+            "aws_secret_access_key": self.embedding_aws_secret_access_key.get_secret_value(),
+            "region_name": self.embedding_aws_region,
+        }
+
         return BedrockEmbeddingEncoder(
-            config=BedrockEmbeddingConfig(
-                aws_access_key_id=self.embedding_aws_access_key_id,
-                aws_secret_access_key=self.embedding_aws_secret_access_key.get_secret_value(),
-                region_name=self.embedding_aws_region,
-            )
+            config=BedrockEmbeddingConfig.model_validate(embedding_kwargs)
         )
 
     def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
@@ -163,7 +165,7 @@ class EmbedderConfig(BaseModel):
             return self.get_octoai_embedder(embedding_kwargs=kwargs)
 
         if self.embedding_provider == "bedrock":
-            return self.get_bedrock_embedder()
+            return self.get_bedrock_embedder(embedding_kwargs=kwargs)
 
         if self.embedding_provider == "vertexai":
             return self.get_vertexai_embedder(embedding_kwargs=kwargs)
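`get_bedrock_embedder` now takes caller kwargs like the other providers and merges the AWS credentials over them with a dict union, so the credentials derived from the config win on any key collision. The merge semantics in isolation (the key names beyond the three credentials are hypothetical):

    caller_kwargs = {"model_name": "amazon.titan-embed-text-v2:0", "region_name": "us-west-2"}
    credentials = {
        "aws_access_key_id": "AKIA...",
        "aws_secret_access_key": "***",
        "region_name": "us-east-1",
    }

    merged = caller_kwargs | credentials  # right-hand operand wins on collisions
    assert merged["region_name"] == "us-east-1"
    assert merged["model_name"] == "amazon.titan-embed-text-v2:0"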
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.2
+Version: 0.5.4
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,13 +22,45 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: pandas
-Requires-Dist: pydantic>=2.7
-Requires-Dist: dataclasses-json
 Requires-Dist: python-dateutil
-Requires-Dist: opentelemetry-sdk
-Requires-Dist: click
+Requires-Dist: dataclasses_json
 Requires-Dist: tqdm
+Requires-Dist: pydantic>=2.7
+Requires-Dist: click
+Provides-Extra: remote
+Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
+Provides-Extra: csv
+Requires-Dist: unstructured[tsv]; extra == "csv"
+Provides-Extra: doc
+Requires-Dist: unstructured[docx]; extra == "doc"
+Provides-Extra: docx
+Requires-Dist: unstructured[docx]; extra == "docx"
+Provides-Extra: epub
+Requires-Dist: unstructured[epub]; extra == "epub"
+Provides-Extra: md
+Requires-Dist: unstructured[md]; extra == "md"
+Provides-Extra: msg
+Requires-Dist: unstructured[msg]; extra == "msg"
+Provides-Extra: odt
+Requires-Dist: unstructured[odt]; extra == "odt"
+Provides-Extra: org
+Requires-Dist: unstructured[org]; extra == "org"
+Provides-Extra: pdf
+Requires-Dist: unstructured[pdf]; extra == "pdf"
+Provides-Extra: ppt
+Requires-Dist: unstructured[pptx]; extra == "ppt"
+Provides-Extra: pptx
+Requires-Dist: unstructured[pptx]; extra == "pptx"
+Provides-Extra: rtf
+Requires-Dist: unstructured[rtf]; extra == "rtf"
+Provides-Extra: rst
+Requires-Dist: unstructured[rst]; extra == "rst"
+Provides-Extra: tsv
+Requires-Dist: unstructured[tsv]; extra == "tsv"
+Provides-Extra: xlsx
+Requires-Dist: unstructured[xlsx]; extra == "xlsx"
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
 Provides-Extra: astradb
@@ -38,63 +70,37 @@ Requires-Dist: fsspec; extra == "azure"
 Requires-Dist: adlfs; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
-Provides-Extra: bedrock
-Requires-Dist: boto3; extra == "bedrock"
-Requires-Dist: aioboto3; extra == "bedrock"
 Provides-Extra: biomed
 Requires-Dist: requests; extra == "biomed"
 Requires-Dist: bs4; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: fsspec; extra == "box"
 Requires-Dist: boxfs; extra == "box"
+Requires-Dist: fsspec; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: atlassian-python-api; extra == "confluence"
 Requires-Dist: requests; extra == "confluence"
+Requires-Dist: atlassian-python-api; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
-Provides-Extra: csv
-Requires-Dist: unstructured[tsv]; extra == "csv"
-Provides-Extra: databricks-delta-tables
-Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
-Provides-Extra: databricks-volumes
-Requires-Dist: databricks-sdk; extra == "databricks-volumes"
 Provides-Extra: delta-table
-Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: boto3; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
-Provides-Extra: doc
-Requires-Dist: unstructured[docx]; extra == "doc"
-Provides-Extra: docx
-Requires-Dist: unstructured[docx]; extra == "docx"
 Provides-Extra: dropbox
-Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Requires-Dist: fsspec; extra == "dropbox"
+Requires-Dist: dropboxdrivefs; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
-Provides-Extra: embed-huggingface
-Requires-Dist: sentence-transformers; extra == "embed-huggingface"
-Provides-Extra: embed-mixedbreadai
-Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
-Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
-Requires-Dist: tiktoken; extra == "embed-octoai"
-Provides-Extra: embed-vertexai
-Requires-Dist: vertexai; extra == "embed-vertexai"
-Provides-Extra: embed-voyageai
-Requires-Dist: voyageai; extra == "embed-voyageai"
-Provides-Extra: epub
-Requires-Dist: unstructured[epub]; extra == "epub"
 Provides-Extra: gcs
+Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: gcsfs; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
@@ -113,97 +119,103 @@ Provides-Extra: kdbai
 Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
 Provides-Extra: lancedb
 Requires-Dist: lancedb; extra == "lancedb"
-Provides-Extra: md
-Requires-Dist: unstructured[md]; extra == "md"
 Provides-Extra: milvus
 Requires-Dist: pymilvus; extra == "milvus"
 Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
-Provides-Extra: msg
-Requires-Dist: unstructured[msg]; extra == "msg"
 Provides-Extra: neo4j
-Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: cymple; extra == "neo4j"
-Requires-Dist: neo4j; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
+Requires-Dist: neo4j-rust-ext; extra == "neo4j"
 Provides-Extra: notion
-Requires-Dist: htmlBuilder; extra == "notion"
-Requires-Dist: backoff; extra == "notion"
-Requires-Dist: notion-client; extra == "notion"
 Requires-Dist: httpx; extra == "notion"
-Provides-Extra: odt
-Requires-Dist: unstructured[odt]; extra == "odt"
+Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: backoff; extra == "notion"
+Requires-Dist: htmlBuilder; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
-Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
-Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
-Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: bs4; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
-Provides-Extra: org
-Requires-Dist: unstructured[org]; extra == "org"
 Provides-Extra: outlook
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
-Provides-Extra: pdf
-Requires-Dist: unstructured[pdf]; extra == "pdf"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
 Provides-Extra: postgres
 Requires-Dist: psycopg2-binary; extra == "postgres"
-Provides-Extra: ppt
-Requires-Dist: unstructured[pptx]; extra == "ppt"
-Provides-Extra: pptx
-Requires-Dist: unstructured[pptx]; extra == "pptx"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
 Provides-Extra: reddit
 Requires-Dist: praw; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
-Provides-Extra: remote
-Requires-Dist: unstructured-client>=0.26.1; extra == "remote"
-Provides-Extra: rst
-Requires-Dist: unstructured[rst]; extra == "rst"
-Provides-Extra: rtf
-Requires-Dist: unstructured[rtf]; extra == "rtf"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
+Provides-Extra: sharepoint
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
-Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
-Requires-Dist: msal; extra == "sharepoint"
-Provides-Extra: singlestore
-Requires-Dist: singlestoredb; extra == "singlestore"
+Requires-Dist: fsspec; extra == "sftp"
 Provides-Extra: slack
-Requires-Dist: slack-sdk[optional]; extra == "slack"
+Requires-Dist: slack_sdk[optional]; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
-Provides-Extra: togetherai
-Requires-Dist: together; extra == "togetherai"
-Provides-Extra: tsv
-Requires-Dist: unstructured[tsv]; extra == "tsv"
-Provides-Extra: vastdb
-Requires-Dist: vastdb; extra == "vastdb"
-Requires-Dist: pyarrow; extra == "vastdb"
-Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
+Provides-Extra: wikipedia
+Requires-Dist: wikipedia; extra == "wikipedia"
+Provides-Extra: weaviate
+Requires-Dist: weaviate-client; extra == "weaviate"
+Provides-Extra: databricks-volumes
+Requires-Dist: databricks-sdk; extra == "databricks-volumes"
+Provides-Extra: databricks-delta-tables
+Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
+Provides-Extra: singlestore
+Requires-Dist: singlestoredb; extra == "singlestore"
 Provides-Extra: vectara
 Requires-Dist: requests; extra == "vectara"
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
-Provides-Extra: weaviate
-Requires-Dist: weaviate-client; extra == "weaviate"
-Provides-Extra: wikipedia
-Requires-Dist: wikipedia; extra == "wikipedia"
-Provides-Extra: xlsx
-Requires-Dist: unstructured[xlsx]; extra == "xlsx"
+Provides-Extra: vastdb
+Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
+Requires-Dist: vastdb; extra == "vastdb"
+Provides-Extra: embed-huggingface
+Requires-Dist: sentence-transformers; extra == "embed-huggingface"
+Provides-Extra: embed-octoai
+Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
+Provides-Extra: embed-vertexai
+Requires-Dist: vertexai; extra == "embed-vertexai"
+Provides-Extra: embed-voyageai
+Requires-Dist: voyageai; extra == "embed-voyageai"
+Provides-Extra: embed-mixedbreadai
+Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
+Provides-Extra: openai
+Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
+Provides-Extra: bedrock
+Requires-Dist: aioboto3; extra == "bedrock"
+Requires-Dist: boto3; extra == "bedrock"
+Provides-Extra: togetherai
+Requires-Dist: together; extra == "togetherai"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # Unstructured Ingest