unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/interfaces.py +4 -4
  22. unstructured_ingest/logger.py +1 -1
  23. unstructured_ingest/pipeline/copy.py +1 -1
  24. unstructured_ingest/pipeline/interfaces.py +2 -2
  25. unstructured_ingest/pipeline/partition.py +1 -1
  26. unstructured_ingest/pipeline/pipeline.py +1 -1
  27. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  28. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  29. unstructured_ingest/pipeline/source.py +2 -2
  30. unstructured_ingest/utils/compression.py +3 -3
  31. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  32. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  33. unstructured_ingest/v2/cli/base/dest.py +1 -1
  34. unstructured_ingest/v2/cli/base/src.py +1 -1
  35. unstructured_ingest/v2/cli/utils/click.py +1 -1
  36. unstructured_ingest/v2/interfaces/processor.py +48 -13
  37. unstructured_ingest/v2/logger.py +1 -1
  38. unstructured_ingest/v2/otel.py +1 -1
  39. unstructured_ingest/v2/pipeline/interfaces.py +9 -2
  40. unstructured_ingest/v2/pipeline/pipeline.py +17 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  42. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  43. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  45. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  46. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  47. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  49. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  50. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  51. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  52. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  53. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  54. unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
  55. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  56. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  57. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
  58. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
  60. unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
  61. unstructured_ingest/v2/processes/filter.py +1 -1
  62. unstructured_ingest/v2/processes/partitioner.py +3 -3
  63. unstructured_ingest/v2/utils.py +7 -0
  64. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
  65. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
  66. unstructured_ingest/evaluate.py +0 -338
  67. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
  68. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
  69. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
  70. {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/airtable.py (new file)
@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, the base id the table is stored in,
+    and an optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        A list of paths that specify the locations to ingest data from within Airtable.
+
+        If this argument is not set, the connector ingests all tables within each and every base.
+        --list-of-paths: path1 path2 path3 ….
+        path: base_id/table_id(optional)/view_id(optional)/
+
+        To obtain (base, table, view) ids in bulk, check:
+        https://airtable.com/developers/web/api/list-bases (base ids)
+        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+        To obtain specific ids from the Airtable UI, go to your workspace and copy any
+        relevant id from the URL structure:
+        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+        appAbcDeF1ghijKlm -> base_id
+        tblABcdEfG1HIJkLm -> table_id
+        viwABCDEfg6hijKLM -> view_id
+
+        You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+        Here is an example for one --list-of-paths:
+        base1/ → gets the entirety of all tables inside base1
+        base1/table1 → gets all rows and columns within table1 in base1
+        base1/table1/view1 → gets the rows and columns that are
+        visible in view1 for the table1 in base1
+
+        Examples of invalid airtable_paths:
+        table1 → has to mention base to be valid
+        base1/view1 → has to mention table to be valid
+        """,
+    )

+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
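
Taken together, the new module registers a complete Airtable source connector: the indexer enumerates tables as FileData entries and the downloader materializes each table as a CSV. A minimal sketch of driving these classes directly, assuming the pyairtable extra is installed; the token and base id below are placeholders:

    from unstructured_ingest.v2.processes.connectors.airtable import (
        AirtableAccessConfig,
        AirtableConnectionConfig,
        AirtableDownloader,
        AirtableIndexer,
        AirtableIndexerConfig,
    )

    # Placeholder credentials; a real personal access token is required.
    connection_config = AirtableConnectionConfig(
        access_config=AirtableAccessConfig(personal_access_token="pat_placeholder"),
    )
    indexer = AirtableIndexer(
        connection_config=connection_config,
        index_config=AirtableIndexerConfig(list_of_paths=["appAbcDeF1ghijKlm/"]),
    )
    downloader = AirtableDownloader(connection_config=connection_config)

    # One FileData per table; each download lands at <base_id>/<table_id>.csv
    # under the downloader's configured (or default) download directory.
    for file_data in indexer.run():
        downloader.run(file_data=file_data)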
unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"Elasticsearch client inputs mapped to: {client_input.dict()}")
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object can not be stored directly in the config
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
         # because it is not serializable.
         from boxsdk import JWTAuth

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -199,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"Query used when indexing: {q}")
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None
unstructured_ingest/v2/processes/connectors/local.py
@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-                identifiers.relative_path[1:]
-                if identifiers.relative_path.startswith("/")
-                else identifiers.relative_path
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
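
This hunk only inlines the identifiers alias and reflows the long replace() call; the path arithmetic itself is unchanged. A quick illustration with hypothetical values:

    from pathlib import Path

    # Hypothetical values mirroring the logic above.
    output_path = Path("/tmp/output")
    relative_path = "/docs/report.pdf"
    filename = "report.pdf"

    rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
    new_path = output_path / Path(rel_path)
    final_path = str(new_path).replace(filename, f"{filename}.json")
    assert final_path == "/tmp/output/docs/report.pdf.json"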
unstructured_ingest/v2/processes/connectors/milvus.py
@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action takse place after metadata flattening.
+    This action takes place after metadata flattening.
     Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -213,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Downloading {file_data.source_identifiers.fullpath} to {download_path}")
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"Downloading file with size: {fsize} bytes in chunks")
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:
unstructured_ingest/v2/processes/connectors/opensearch.py
@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -57,7 +57,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         )

         index = pc.Index(name=self.index_name, **index_kwargs)
-        logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}")
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -166,7 +166,7 @@ class PineconeUploader(Uploader):
                 max_batch_size=self.upload_config.batch_size,
             )
         )
-        logger.info(f"Split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")

         max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
         if self.upload_config.pool_threads:
unstructured_ingest/v2/processes/connectors/sharepoint.py
@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(description="Microsoft Graph API application id")
-    permissions_tenant: str = Field(
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,
@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config.permissions_tenant
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )
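
With all three fields now defaulting to None, a SharepointPermissionsConfig can be constructed without Microsoft Graph credentials, and process_permissions short-circuits on a missing config instead of raising. A hedged sketch, with placeholder values throughout:

    from unstructured_ingest.v2.processes.connectors.sharepoint import (
        SharepointPermissionsConfig,
    )

    # Valid as of this release: every permissions field defaults to None.
    no_permissions = SharepointPermissionsConfig()

    # Supplying all three values enables permissions processing (placeholders).
    with_permissions = SharepointPermissionsConfig(
        permissions_application_id="00000000-0000-0000-0000-000000000000",
        permissions_tenant="https://contoso.onmicrosoft.com",
        permissions_client_cred="client-secret-placeholder",
    )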
unstructured_ingest/v2/processes/filter.py
@@ -47,7 +47,7 @@ class Filterer(BaseProcess, ABC):
         for pattern in patterns:
             if fnmatch.filter([path], pattern):
                 return True
-        logger.debug(f"The file {path!r} is discarded as it does not match any given glob.")
+        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
         return False

     def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
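
The only change here is log casing, but the surrounding context shows the glob check itself: fnmatch.filter([path], pattern) returns a non-empty list exactly when the path matches the pattern. For instance:

    import fnmatch

    # fnmatch's "*" also matches "/", so directory prefixes match too.
    assert fnmatch.filter(["docs/report.pdf"], "*.pdf") == ["docs/report.pdf"]
    assert fnmatch.filter(["docs/report.pdf"], "*.txt") == []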
unstructured_ingest/v2/processes/partitioner.py
@@ -145,7 +145,7 @@ class Partitioner(BaseProcess, ABC):
         class FileDataSourceMetadata(DataSourceMetadata):
             filesize_bytes: Optional[int] = None

-        logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
+        logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
         logger.debug(f"partitioning file {filename} with metadata {metadata}")
         elements = partition(
             filename=str(filename.resolve()),
@@ -165,7 +165,7 @@ class Partitioner(BaseProcess, ABC):

         partition_request = self.config.to_partition_kwargs()

-        # Note(austin): PartitionParameters is a Pydantic model in v0.26.0
+        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
         # Prior to this it was a dataclass which doesn't have .__fields
         try:
             possible_fields = PartitionParameters.__fields__
@@ -182,7 +182,7 @@ class Partitioner(BaseProcess, ABC):
                 ", ".join([v for v in partition_request if v not in filtered_partition_request])
             )
         )
-        logger.debug(f"Using hosted partitioner with kwargs: {partition_request}")
+        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
         with open(filename, "rb") as f:
             files = Files(
                 content=f.read(),
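
The renamed comment marks a real version boundary: PartitionParameters became a Pydantic model in unstructured-client v0.26.0, so __fields__ only exists on newer versions. The hunk shows the try side; one plausible shape for the full compatibility check, where the except branch is an assumption not shown in this diff:

    from dataclasses import fields as dataclass_fields

    from unstructured_client.models.shared import PartitionParameters

    partition_request = {"strategy": "fast", "not_a_real_param": 1}  # hypothetical kwargs

    try:
        possible_fields = PartitionParameters.__fields__  # Pydantic model, >= 0.26.0
    except AttributeError:
        # Assumed fallback for the older dataclass form of PartitionParameters
        possible_fields = [f.name for f in dataclass_fields(PartitionParameters)]

    filtered_partition_request = {
        k: v for k, v in partition_request.items() if k in possible_fields
    }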
unstructured_ingest/v2/utils.py
@@ -20,6 +20,11 @@ def is_secret(value: Any) -> bool:
 def serialize_base_model(model: BaseModel) -> dict:
     # To get the full serialized dict regardless of if values are marked as Secret
     model_dict = model.dict()
+    return serialize_base_dict(model_dict=model_dict)
+
+
+def serialize_base_dict(model_dict: dict) -> dict:
+    model_dict = model_dict.copy()
     for k, v in model_dict.items():
         if isinstance(v, _SecretBase):
             secret_value = v.get_secret_value()
@@ -27,6 +32,8 @@ def serialize_base_model(model: BaseModel) -> dict:
                 model_dict[k] = serialize_base_model(model=secret_value)
             else:
                 model_dict[k] = secret_value
+        if isinstance(v, dict):
+            model_dict[k] = serialize_base_dict(model_dict=v)

     return model_dict
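
The new serialize_base_dict helper makes secret unwrapping recursive, so Secret values nested inside plain dict fields are revealed as well. A small sketch of the intended behavior; the model below is hypothetical:

    from pydantic import BaseModel, SecretStr

    from unstructured_ingest.v2.utils import serialize_base_model


    class ExampleConfig(BaseModel):  # hypothetical model for illustration
        name: str
        extras: dict


    config = ExampleConfig(name="demo", extras={"token": SecretStr("s3cr3t")})
    # The SecretStr nested inside the plain dict is now unwrapped too.
    assert serialize_base_model(model=config) == {
        "name": "demo",
        "extras": {"token": "s3cr3t"},
    }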