unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff compares the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of unstructured-ingest might be problematic.

Files changed (123)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +1 -5
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +45 -35
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/local.py +22 -14
  69. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  70. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  72. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +23 -20
  74. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  75. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  76. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  77. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  78. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  79. unstructured_ingest/v2/processes/embedder.py +106 -47
  80. unstructured_ingest/v2/processes/filter.py +11 -5
  81. unstructured_ingest/v2/processes/partitioner.py +79 -33
  82. unstructured_ingest/v2/processes/uncompress.py +3 -3
  83. unstructured_ingest/v2/utils.py +45 -0
  84. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  85. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +89 -116
  86. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  87. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  88. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  89. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  90. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  91. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  92. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  93. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  94. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  95. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  96. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  97. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  98. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  99. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  100. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  101. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  102. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  103. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  104. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  105. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  106. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  108. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  109. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  110. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  111. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  112. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  113. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  114. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  115. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  116. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  117. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  118. unstructured_ingest/v2/cli/interfaces.py +0 -27
  119. unstructured_ingest/v2/pipeline/utils.py +0 -15
  120. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  121. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  122. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  123. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql.py

```diff
@@ -1,16 +1,15 @@
-import enum
 import json
 import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -33,40 +32,41 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"
+SQLITE_DB = "sqlite"
+POSTGRESQL_DB = "postgresql"
 
 
-@dataclass
 class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = None
-    password: Optional[str] = None
+    username: Optional[str] = Field(default=None, description="DB username")
+    password: Optional[str] = Field(default=None, description="DB password")
 
 
-class DatabaseType(str, enum.Enum):
-    SQLITE = "sqlite"
-    POSTGRESQL = "postgresql"
+SecreteSQLAccessConfig = Secret[SQLAccessConfig]
 
 
-@dataclass
 class SQLConnectionConfig(ConnectionConfig):
-    db_type: DatabaseType = (
-        # required default value here because of parent class
-        DatabaseType.SQLITE
+    db_type: Literal["sqlite", "postgresql"] = Field(
+        default=SQLITE_DB, description="Type of the database backend"
    )
-    database: Optional[str] = None
-    host: Optional[str] = None
-    port: Optional[int] = 5432
-    access_config: Optional[SQLAccessConfig] = enhanced_field(default=None, sensitive=True)
-    connector_type: str = CONNECTOR_TYPE
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. For sqlite databases, this is the path to the .db file.",
+    )
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    access_config: SecreteSQLAccessConfig = Field(
+        default_factory=lambda: SecreteSQLAccessConfig(secret_value=SQLAccessConfig())
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def __post_init__(self):
-        if (self.db_type == DatabaseType.SQLITE) and (self.database is None):
+        if (self.db_type == SQLITE_DB) and (self.database is None):
             raise ValueError(
                 "A sqlite connection requires a path to a *.db file "
                 "through the `database` argument"
             )
 
 
-@dataclass
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -182,9 +182,8 @@ class SQLUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = 50
+    batch_size: int = Field(default=50, description="Number of records per batch")
 
 
 @dataclass
@@ -204,9 +203,9 @@ class SQLUploader(Uploader):
 
     @property
     def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
-        if self.connection_config.db_type == DatabaseType.POSTGRESQL:
+        if self.connection_config.db_type == POSTGRESQL_DB:
             return self._make_psycopg_connection
-        elif self.connection_config.db_type == DatabaseType.SQLITE:
+        elif self.connection_config.db_type == SQLITE_DB:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
 
@@ -219,9 +218,10 @@ class SQLUploader(Uploader):
     def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect
 
+        access_config = self.connection_config.access_config.get_secret_value()
         return connect(
-            user=self.connection_config.access_config.username,
-            password=self.connection_config.access_config.password,
+            user=access_config.username,
+            password=access_config.password,
             dbname=self.connection_config.database,
             host=self.connection_config.host,
             port=self.connection_config.port,
@@ -234,9 +234,7 @@ class SQLUploader(Uploader):
         for row in data:
             parsed = []
             for column_name, value in zip(columns, row):
-                if self.connection_config.db_type == DatabaseType.SQLITE and isinstance(
-                    value, (list, dict)
-                ):
+                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
                     if value is None:
@@ -255,14 +253,14 @@ class SQLUploader(Uploader):
 
         columns = tuple(df.columns)
         stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-            VALUES({','.join(['?' if self.connection_config.db_type==DatabaseType.SQLITE else '%s' for x in columns])})"  # noqa E501
+            VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
 
         for rows in pd.read_json(
             content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type == DatabaseType.SQLITE:
+                if self.connection_config.db_type == SQLITE_DB:
                     conn.executemany(stmt, values)
                 else:
                     with conn.cursor() as cur:
```
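The SQL destination connector drops its `DatabaseType` enum and `enhanced_field`-based dataclasses in favor of plain string constants and pydantic models, with credentials wrapped in `pydantic.Secret`. A minimal usage sketch follows (not part of the diff; the import path is inferred from the files-changed list above, and the sqlite path is a placeholder):

```python
# Hypothetical sketch: building the new pydantic-based SQL connection config
# for a sqlite destination. Values shown here are placeholders.
from unstructured_ingest.v2.processes.connectors.sql import (
    SecreteSQLAccessConfig,  # note: spelled "Secrete" in the released module
    SQLAccessConfig,
    SQLConnectionConfig,
)

connection_config = SQLConnectionConfig(
    db_type="sqlite",        # validated against Literal["sqlite", "postgresql"]
    database="elements.db",  # for sqlite, the path to the .db file
    access_config=SecreteSQLAccessConfig(secret_value=SQLAccessConfig()),
)

# Credentials stay masked until explicitly unwrapped:
creds = connection_config.access_config.get_secret_value()
```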
unstructured_ingest/v2/processes/connectors/weaviate.py

```diff
@@ -5,8 +5,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -30,27 +30,37 @@
 CONNECTOR_TYPE = "weaviate"
 
 
-@dataclass
 class WeaviateAccessConfig(AccessConfig):
-    access_token: Optional[str] = None
+    access_token: Optional[str] = Field(
+        default=None, description="Used to create the bearer token."
+    )
     api_key: Optional[str] = None
     client_secret: Optional[str] = None
     password: Optional[str] = None
 
 
-@dataclass
+SecretWeaviateAccessConfig = Secret[WeaviateAccessConfig]
+
+
 class WeaviateConnectionConfig(ConnectionConfig):
-    host_url: str
-    class_name: str
-    access_config: WeaviateAccessConfig = enhanced_field(sensitive=True)
+    host_url: str = Field(description="Weaviate instance url")
+    class_name: str = Field(
+        description="Name of the class to push the records into, e.g: Pdf-elements"
+    )
+    access_config: SecretWeaviateAccessConfig = Field(
+        default_factory=lambda: SecretWeaviateAccessConfig(secret_value=WeaviateAccessConfig())
+    )
     username: Optional[str] = None
-    anonymous: bool = False
+    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
     scope: Optional[list[str]] = None
-    refresh_token: Optional[str] = None
-    connector_type: str = CONNECTOR_TYPE
+    refresh_token: Optional[str] = Field(
+        default=None,
+        description="Will tie this value to the bearer token. If not provided, "
+        "the authentication will expire once the lifetime of the access token is up.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class WeaviateUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -148,9 +158,8 @@ class WeaviateUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class WeaviateUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
```
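As with the SQL connector, the Weaviate config classes move from dataclasses to pydantic models and the access config is wrapped in a `Secret`. A hedged usage sketch (host URL and API key below are placeholders; the import path is inferred from the files-changed list):

```python
# Hypothetical sketch: constructing the new Weaviate connection config.
from unstructured_ingest.v2.processes.connectors.weaviate import (
    SecretWeaviateAccessConfig,
    WeaviateAccessConfig,
    WeaviateConnectionConfig,
)

connection_config = WeaviateConnectionConfig(
    host_url="https://my-weaviate.example.com",  # placeholder instance URL
    class_name="Elements",
    access_config=SecretWeaviateAccessConfig(
        secret_value=WeaviateAccessConfig(api_key="...")  # placeholder credential
    ),
)

# Reading a credential back requires unwrapping the Secret:
api_key = connection_config.access_config.get_secret_value().api_key
```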
unstructured_ingest/v2/processes/embedder.py

```diff
@@ -1,76 +1,135 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
-from unstructured.documents.elements import Element
-from unstructured.embed.interfaces import BaseEmbeddingEncoder
-from unstructured.staging.base import elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
+if TYPE_CHECKING:
+    from unstructured.embed.interfaces import BaseEmbeddingEncoder
 
-@dataclass
-class EmbedderConfig(EnhancedDataClassJsonMixin):
-    embedding_provider: Optional[str] = None
-    embedding_api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
-    embedding_model_name: Optional[str] = None
-    embedding_aws_access_key_id: Optional[str] = None
-    embedding_aws_secret_access_key: Optional[str] = None
-    embedding_aws_region: Optional[str] = None
-
-    def get_embedder(self) -> BaseEmbeddingEncoder:
+
+class EmbedderConfig(BaseModel):
+    embedding_provider: Optional[
+        Literal[
+            "langchain-openai",
+            "langchain-huggingface",
+            "langchain-aws-bedrock",
+            "langchain-vertexai",
+            "langchain-voyageai",
+            "octoai",
+        ]
+    ] = Field(default=None, description="Type of the embedding class to be used.")
+    embedding_api_key: Optional[SecretStr] = Field(
+        default=None,
+        description="API key for the embedding model, for the case an API key is needed.",
+    )
+    embedding_model_name: Optional[str] = Field(
+        default=None,
+        description="Embedding model name, if needed. "
+        "Chooses a particular LLM between different options, to embed with it.",
+    )
+    embedding_aws_access_key_id: Optional[str] = Field(
+        default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
+    )
+    embedding_aws_secret_access_key: Optional[SecretStr] = Field(
+        default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
+    )
+    embedding_aws_region: Optional[str] = Field(
+        default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
+    )
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
+    def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.huggingface import (
+            HuggingFaceEmbeddingConfig,
+            HuggingFaceEmbeddingEncoder,
+        )
+
+        return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="openai")
+    def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+
+        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
+    def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+
+        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="bedrock")
+    def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+
+        return BedrockEmbeddingEncoder(
+            config=BedrockEmbeddingConfig(
+                aws_access_key_id=self.embedding_aws_access_key_id,
+                aws_secret_access_key=self.embedding_aws_secret_access_key.get_secret_value(),
+                region_name=self.embedding_aws_region,
+            )
+        )
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
+    def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.vertexai import (
+            VertexAIEmbeddingConfig,
+            VertexAIEmbeddingEncoder,
+        )
+
+        return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
+    def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+        return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**embedding_kwargs))
+
+    def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
         if self.embedding_api_key:
-            kwargs["api_key"] = self.embedding_api_key
+            kwargs["api_key"] = self.embedding_api_key.get_secret_value()
         if self.embedding_model_name:
             kwargs["model_name"] = self.embedding_model_name
         # TODO make this more dynamic to map to encoder configs
         if self.embedding_provider == "langchain-openai":
-            from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+            return self.get_openai_embedder(embedding_kwargs=kwargs)
 
-            return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**kwargs))
-        elif self.embedding_provider == "langchain-huggingface":
-            from unstructured.embed.huggingface import (
-                HuggingFaceEmbeddingConfig,
-                HuggingFaceEmbeddingEncoder,
-            )
+        if self.embedding_provider == "langchain-huggingface":
+            return self.get_huggingface_embedder(embedding_kwargs=kwargs)
 
-            return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs))
-        elif self.embedding_provider == "octoai":
-            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+        if self.embedding_provider == "octoai":
+            return self.get_octoai_embedder(embedding_kwargs=kwargs)
 
-            return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**kwargs))
-        elif self.embedding_provider == "langchain-aws-bedrock":
-            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+        if self.embedding_provider == "langchain-aws-bedrock":
+            return self.get_bedrock_embedder()
 
-            return BedrockEmbeddingEncoder(
-                config=BedrockEmbeddingConfig(
-                    aws_access_key_id=self.embedding_aws_access_key_id,
-                    aws_secret_access_key=self.embedding_aws_secret_access_key,
-                    region_name=self.embedding_aws_region,
-                )
-            )
-        elif self.embedding_provider == "langchain-vertexai":
-            from unstructured.embed.vertexai import (
-                VertexAIEmbeddingConfig,
-                VertexAIEmbeddingEncoder,
-            )
+        if self.embedding_provider == "langchain-vertexai":
+            return self.get_vertexai_embedder(embedding_kwargs=kwargs)
 
-            return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**kwargs))
-        else:
-            raise ValueError(f"{self.embedding_provider} not a recognized encoder")
+        if self.embedding_provider == "langchain-voyageai":
+            return self.get_voyageai_embedder(embedding_kwargs=kwargs)
+
+        raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
 
 @dataclass
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
-    def run(self, elements_filepath: Path, **kwargs: Any) -> list[Element]:
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.staging.base import elements_from_json
+
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
-        return embedder.embed_documents(elements=elements)
+            return [e.to_dict() for e in elements]
+        embedded_elements = embedder.embed_documents(elements=elements)
+        return [e.to_dict() for e in embedded_elements]
```
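The embedder config now restricts `embedding_provider` to a known set of values, keeps the API key in a `SecretStr`, and defers the provider-specific `unstructured.embed` imports until an encoder is actually requested. A hedged usage sketch (the model name is a placeholder, and the matching `unstructured` embedding extra must be installed for `get_embedder()` to succeed):

```python
# Hypothetical sketch: selecting an encoder through the refactored EmbedderConfig.
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="langchain-huggingface",
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",  # placeholder model
)

# Provider-specific imports happen lazily inside the get_*_embedder helpers;
# an unrecognized provider raises ValueError.
encoder = config.get_embedder()
```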
unstructured_ingest/v2/processes/filter.py

```diff
@@ -3,16 +3,22 @@ from abc import ABC
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel, Field
+
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
 
-@dataclass
-class FiltererConfig(EnhancedDataClassJsonMixin):
-    file_glob: Optional[list[str]] = None
-    max_file_size: Optional[int] = None
+class FiltererConfig(BaseModel):
+    file_glob: Optional[list[str]] = Field(
+        default=None,
+        description="file globs to limit which types of " "files are accepted",
+        examples=["*.pdf", "*.html"],
+    )
+    max_file_size: Optional[int] = Field(
+        default=None, description="Max file size to process in bytes"
+    )
 
 
 @dataclass
```
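A brief sketch of the new filter model in use (values are illustrative only):

```python
# Hypothetical sketch: FiltererConfig is now a pydantic model, so values are
# validated at construction time and serialize via model_dump().
from unstructured_ingest.v2.processes.filter import FiltererConfig

config = FiltererConfig(file_glob=["*.pdf", "*.html"], max_file_size=10 * 1024 * 1024)
print(config.model_dump())  # {'file_glob': ['*.pdf', '*.html'], 'max_file_size': 10485760}
```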
unstructured_ingest/v2/processes/partitioner.py

```diff
@@ -1,14 +1,13 @@
 import asyncio
 from abc import ABC
-from dataclasses import dataclass, field, fields
+from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from unstructured.documents.elements import DataSourceMetadata
-from unstructured.staging.base import elements_to_dicts, flatten_dict
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-from unstructured_ingest.enhanced_dataclass.dataclasses import enhanced_field
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
@@ -17,25 +16,65 @@ if TYPE_CHECKING:
     from unstructured_client.models.shared import PartitionParameters
 
 
-@dataclass
-class PartitionerConfig(EnhancedDataClassJsonMixin):
-    strategy: str = "auto"
-    ocr_languages: Optional[list[str]] = None
-    encoding: Optional[str] = None
-    additional_partition_args: Optional[dict[str, Any]] = None
-    skip_infer_table_types: Optional[list[str]] = None
-    fields_include: list[str] = field(
+class PartitionerConfig(BaseModel):
+    strategy: str = Field(
+        default="auto",
+        description="The method that will be used to process the documents. ",
+        examples=["fast", "hi_res", "auto"],
+    )
+    ocr_languages: Optional[list[str]] = Field(
+        default=None,
+        description="A list of language packs to specify which languages to use for OCR, "
+        "The appropriate Tesseract language pack needs to be installed.",
+        examples=["eng", "deu", "eng,deu"],
+    )
+    encoding: Optional[str] = Field(
+        default=None,
+        description="Text encoding to use when reading documents. "
+        "By default the encoding is detected automatically.",
+    )
+    additional_partition_args: Optional[dict[str, Any]] = Field(
+        default=None, description="Additional values to pass through to partition()"
+    )
+    skip_infer_table_types: Optional[list[str]] = Field(
+        default=None, description="Optional list of document types to skip table extraction on"
+    )
+    fields_include: list[str] = Field(
         default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
+        description="If set, include the specified top-level fields in an element.",
+    )
+    flatten_metadata: bool = Field(
+        default=False,
+        description="Results in flattened json elements. "
+        "Specifically, the metadata key values are brought to "
+        "the top-level of the element, and the `metadata` key itself is removed.",
+    )
+    metadata_exclude: list[str] = Field(
+        default_factory=list,
+        description="If set, drop the specified metadata " "fields if they exist.",
     )
-    flatten_metadata: bool = False
-    metadata_exclude: list[str] = field(default_factory=list)
-    metadata_include: list[str] = field(default_factory=list)
-    partition_endpoint: Optional[str] = "https://api.unstructured.io/general/v0/general"
-    partition_by_api: bool = False
-    api_key: Optional[str] = enhanced_field(default=None, sensitive=True)
-    hi_res_model_name: Optional[str] = None
-
-    def __post_init__(self):
+    metadata_include: list[str] = Field(
+        default_factory=list,
+        description="If set, include the specified metadata "
+        "fields if they exist and drop all other fields. ",
+    )
+    partition_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If partitioning via api, use the following host.",
+    )
+    partition_by_api: bool = Field(
+        default=False,
+        description="Use a remote API to partition the files."
+        " Otherwise, use the function from partition.auto",
+    )
+    api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for partition endpoint."
+    )
+    hi_res_model_name: Optional[str] = Field(
+        default=None, description="Model name for hi-res strategy."
+    )
+
+    def model_post_init(self, __context: Any) -> None:
         if self.metadata_exclude and self.metadata_include:
             raise ValueError(
                 "metadata_exclude and metadata_include are "
@@ -93,16 +132,23 @@ class Partitioner(BaseProcess, ABC):
             elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
         return element_dicts
 
+    @requires_dependencies(dependencies=["unstructured"])
     def partition_locally(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
+        from unstructured.documents.elements import DataSourceMetadata
         from unstructured.partition.auto import partition
+        from unstructured.staging.base import elements_to_dicts
+
+        @dataclass
+        class FileDataSourceMetadata(DataSourceMetadata):
+            filesize_bytes: Optional[int] = None
 
         logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
-        logger.debug(f"partitioning file {filename} with metadata {metadata.to_dict()}")
+        logger.debug(f"partitioning file {filename} with metadata {metadata}")
         elements = partition(
             filename=str(filename.resolve()),
-            data_source_metadata=metadata,
+            data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
             **self.config.to_partition_kwargs(),
         )
         return self.postprocess(elements=elements_to_dicts(elements))
@@ -138,29 +184,29 @@ class Partitioner(BaseProcess, ABC):
         partition_params = PartitionParameters(**filtered_partition_request)
         return partition_params
 
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
 
-        logger.debug(f"partitioning file {filename} with metadata: {metadata.to_dict()}")
+        logger.debug(f"partitioning file {filename} with metadata: {metadata}")
         client = UnstructuredClient(
-            server_url=self.config.partition_endpoint, api_key_auth=self.config.api_key
+            server_url=self.config.partition_endpoint,
+            api_key_auth=self.config.api_key.get_secret_value(),
         )
         partition_params = self.create_partition_parameters(filename=filename)
         resp = await self.call_api(client=client, request=partition_params)
         elements = resp.elements or []
         # Append the data source metadata the auto partition does for you
         for element in elements:
-            element["metadata"]["data_source"] = metadata.to_dict()
+            element["metadata"]["data_source"] = metadata
         return self.postprocess(elements=elements)
 
-    def run(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
+    def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
         return self.partition_locally(filename, metadata=metadata, **kwargs)
 
     async def run_async(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         return await self.partition_via_api(filename, metadata=metadata, **kwargs)
```
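`PartitionerConfig` also moves to pydantic: the API key becomes a `SecretStr`, metadata is passed around as a plain dict, and the exclude/include check now runs in `model_post_init`. A hedged usage sketch (the key value is a placeholder):

```python
# Hypothetical sketch: configuring API-based partitioning with the new pydantic model.
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

config = PartitionerConfig(
    strategy="hi_res",
    partition_by_api=True,
    partition_endpoint="https://api.unstructured.io/general/v0/general",
    api_key="my-api-key",  # placeholder; coerced to SecretStr by pydantic
)

# The secret is masked in reprs and logs; the raw value requires an explicit unwrap.
assert "my-api-key" not in repr(config)
raw_key = config.api_key.get_secret_value()
```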
unstructured_ingest/v2/processes/uncompress.py

```diff
@@ -4,14 +4,14 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel
+
 from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
 
-@dataclass
-class UncompressConfig(EnhancedDataClassJsonMixin):
+class UncompressConfig(BaseModel):
     pass
 
 
```
unstructured_ingest/v2/utils.py (new file)

```diff
@@ -0,0 +1,45 @@
+import json
+from datetime import datetime
+from inspect import isclass
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel
+from pydantic.types import _SecretBase
+
+
+def is_secret(value: Any) -> bool:
+    # Case Secret[int]
+    if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+        origin = value.__origin__
+        return isclass(origin) and issubclass(origin, _SecretBase)
+    # Case SecretStr
+    return isclass(value) and issubclass(value, _SecretBase)
+
+
+def serialize_base_model(model: BaseModel) -> dict:
+    # To get the full serialized dict regardless of if values are marked as Secret
+    model_dict = model.dict()
+    for k, v in model_dict.items():
+        if isinstance(v, _SecretBase):
+            secret_value = v.get_secret_value()
+            if isinstance(secret_value, BaseModel):
+                model_dict[k] = serialize_base_model(model=secret_value)
+            else:
+                model_dict[k] = secret_value
+
+    return model_dict
+
+
+def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
+    model_dict = serialize_base_model(model=model)
+
+    def json_serial(obj):
+        if isinstance(obj, Path):
+            return obj.as_posix()
+        if isinstance(obj, datetime):
+            return obj.isoformat()
+        raise TypeError("Type %s not serializable" % type(obj))
+
+    # Support json dumps kwargs such as sort_keys
+    return json.dumps(model_dict, default=json_serial, **json_kwargs)
```
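These helpers exist to dump a config model including any `Secret`-wrapped values, which pydantic would otherwise mask. A hedged usage sketch (the two config classes below are stand-ins invented for illustration, not part of the package):

```python
# Hypothetical sketch: serializing a model that nests a Secret-wrapped config.
from pydantic import BaseModel, Secret

from unstructured_ingest.v2.utils import serialize_base_model, serialize_base_model_json


class DemoAccessConfig(BaseModel):  # stand-in for a connector access config
    password: str = "hunter2"


class DemoConnectionConfig(BaseModel):  # stand-in for a connector connection config
    host: str = "localhost"
    access_config: Secret[DemoAccessConfig] = Secret(DemoAccessConfig())


config = DemoConnectionConfig()
print(serialize_base_model(config))
# -> {'host': 'localhost', 'access_config': {'password': 'hunter2'}}
print(serialize_base_model_json(config, sort_keys=True))
```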