unstructured-ingest 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.0.23" # pragma: no cover
1
+ __version__ = "0.0.24" # pragma: no cover
@@ -155,14 +155,14 @@ def _get_type_from_field(field: FieldInfo) -> click.ParamType:
155
155
 
156
156
  def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
157
157
  param_decls = [option_name]
158
- help = field_info.description or ""
158
+ help_text = field_info.description or ""
159
159
  if examples := field_info.examples:
160
- help += f" [Examples: {', '.join(examples)}]"
160
+ help_text += f" [Examples: {', '.join(examples)}]"
161
161
  option_kwargs = {
162
162
  "type": _get_type_from_field(field_info),
163
163
  "default": get_default_value_from_field(field_info),
164
164
  "required": field_info.is_required(),
165
- "help": help,
165
+ "help": str(help_text),
166
166
  "is_flag": is_boolean_flag(field_info),
167
167
  "show_default": field_info.default is not PydanticUndefined,
168
168
  }
@@ -58,20 +58,6 @@ class PineconeConnectionConfig(ConnectionConfig):
58
58
  return index
59
59
 
60
60
 
61
- class PineconeUploadStagerConfig(UploadStagerConfig):
62
- pass
63
-
64
-
65
- class PineconeUploaderConfig(UploaderConfig):
66
- batch_size: Optional[int] = Field(
67
- default=None,
68
- description="Optional number of records per batch. Will otherwise limit by size.",
69
- )
70
- pool_threads: Optional[int] = Field(
71
- default=1, description="Optional limit on number of threads to use for upload"
72
- )
73
-
74
-
75
61
  ALLOWED_FIELDS = (
76
62
  "element_id",
77
63
  "text",
@@ -86,31 +72,56 @@ ALLOWED_FIELDS = (
86
72
  "is_continuation",
87
73
  "link_urls",
88
74
  "link_texts",
75
+ "text_as_html",
89
76
  )
90
77
 
91
78
 
79
+ class PineconeUploadStagerConfig(UploadStagerConfig):
80
+ metadata_fields: list[str] = Field(
81
+ default=str(ALLOWED_FIELDS),
82
+ description=(
83
+ "which metadata from the source element to map to the payload metadata being sent to "
84
+ "Pinecone."
85
+ ),
86
+ )
87
+
88
+
89
+ class PineconeUploaderConfig(UploaderConfig):
90
+ batch_size: Optional[int] = Field(
91
+ default=None,
92
+ description="Optional number of records per batch. Will otherwise limit by size.",
93
+ )
94
+ pool_threads: Optional[int] = Field(
95
+ default=1, description="Optional limit on number of threads to use for upload"
96
+ )
97
+
98
+
92
99
  @dataclass
93
100
  class PineconeUploadStager(UploadStager):
94
101
  upload_stager_config: PineconeUploadStagerConfig = field(
95
102
  default_factory=lambda: PineconeUploadStagerConfig()
96
103
  )
97
104
 
98
- @staticmethod
99
- def conform_dict(element_dict: dict) -> dict:
105
+ def conform_dict(self, element_dict: dict) -> dict:
100
106
  embeddings = element_dict.pop("embeddings", None)
101
107
  metadata: dict[str, Any] = element_dict.pop("metadata", {})
102
108
  data_source = metadata.pop("data_source", {})
103
109
  coordinates = metadata.pop("coordinates", {})
104
-
105
- element_dict.update(metadata)
106
- element_dict.update(data_source)
107
- element_dict.update(coordinates)
110
+ pinecone_metadata = {}
111
+ for possible_meta in [element_dict, metadata, data_source, coordinates]:
112
+ pinecone_metadata.update(
113
+ {
114
+ k: v
115
+ for k, v in possible_meta.items()
116
+ if k in self.upload_stager_config.metadata_fields
117
+ }
118
+ )
108
119
 
109
120
  return {
110
121
  "id": str(uuid.uuid4()),
111
122
  "values": embeddings,
112
123
  "metadata": flatten_dict(
113
- {k: v for k, v in element_dict.items() if k in ALLOWED_FIELDS},
124
+ pinecone_metadata,
114
125
  separator="-",
115
126
  flatten_lists=True,
116
127
  remove_none=True,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: unstructured-ingest
3
- Version: 0.0.23
3
+ Version: 0.0.24
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -23,19 +23,19 @@ Requires-Python: >=3.9.0,<3.13
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
25
  Requires-Dist: pydantic>=2.7
26
- Requires-Dist: python-dateutil
26
+ Requires-Dist: tqdm
27
27
  Requires-Dist: click
28
+ Requires-Dist: python-dateutil
28
29
  Requires-Dist: opentelemetry-sdk
29
30
  Requires-Dist: pandas
30
31
  Requires-Dist: dataclasses-json
31
- Requires-Dist: tqdm
32
32
  Provides-Extra: airtable
33
33
  Requires-Dist: pyairtable; extra == "airtable"
34
34
  Provides-Extra: astradb
35
35
  Requires-Dist: astrapy; extra == "astradb"
36
36
  Provides-Extra: azure
37
- Requires-Dist: fsspec; extra == "azure"
38
37
  Requires-Dist: adlfs; extra == "azure"
38
+ Requires-Dist: fsspec; extra == "azure"
39
39
  Provides-Extra: azure-cognitive-search
40
40
  Requires-Dist: azure-search-documents; extra == "azure-cognitive-search"
41
41
  Provides-Extra: bedrock
@@ -44,8 +44,8 @@ Provides-Extra: biomed
44
44
  Requires-Dist: bs4; extra == "biomed"
45
45
  Requires-Dist: requests; extra == "biomed"
46
46
  Provides-Extra: box
47
- Requires-Dist: fsspec; extra == "box"
48
47
  Requires-Dist: boxfs; extra == "box"
48
+ Requires-Dist: fsspec; extra == "box"
49
49
  Provides-Extra: chroma
50
50
  Requires-Dist: chromadb; extra == "chroma"
51
51
  Provides-Extra: clarifai
@@ -60,8 +60,8 @@ Requires-Dist: unstructured[tsv]; extra == "csv"
60
60
  Provides-Extra: databricks-volumes
61
61
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
62
62
  Provides-Extra: delta-table
63
- Requires-Dist: fsspec; extra == "delta-table"
64
63
  Requires-Dist: deltalake; extra == "delta-table"
64
+ Requires-Dist: fsspec; extra == "delta-table"
65
65
  Provides-Extra: discord
66
66
  Requires-Dist: discord-py; extra == "discord"
67
67
  Provides-Extra: doc
@@ -69,8 +69,8 @@ Requires-Dist: unstructured[docx]; extra == "doc"
69
69
  Provides-Extra: docx
70
70
  Requires-Dist: unstructured[docx]; extra == "docx"
71
71
  Provides-Extra: dropbox
72
- Requires-Dist: fsspec; extra == "dropbox"
73
72
  Requires-Dist: dropboxdrivefs; extra == "dropbox"
73
+ Requires-Dist: fsspec; extra == "dropbox"
74
74
  Provides-Extra: elasticsearch
75
75
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
76
76
  Provides-Extra: embed-huggingface
@@ -87,19 +87,19 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
87
87
  Provides-Extra: epub
88
88
  Requires-Dist: unstructured[epub]; extra == "epub"
89
89
  Provides-Extra: gcs
90
- Requires-Dist: bs4; extra == "gcs"
91
- Requires-Dist: fsspec; extra == "gcs"
92
90
  Requires-Dist: gcsfs; extra == "gcs"
91
+ Requires-Dist: fsspec; extra == "gcs"
92
+ Requires-Dist: bs4; extra == "gcs"
93
93
  Provides-Extra: github
94
- Requires-Dist: requests; extra == "github"
95
94
  Requires-Dist: pygithub>1.58.0; extra == "github"
95
+ Requires-Dist: requests; extra == "github"
96
96
  Provides-Extra: gitlab
97
97
  Requires-Dist: python-gitlab; extra == "gitlab"
98
98
  Provides-Extra: google-drive
99
99
  Requires-Dist: google-api-python-client; extra == "google-drive"
100
100
  Provides-Extra: hubspot
101
- Requires-Dist: urllib3; extra == "hubspot"
102
101
  Requires-Dist: hubspot-api-client; extra == "hubspot"
102
+ Requires-Dist: urllib3; extra == "hubspot"
103
103
  Provides-Extra: jira
104
104
  Requires-Dist: atlassian-python-api; extra == "jira"
105
105
  Provides-Extra: kafka
@@ -115,16 +115,16 @@ Requires-Dist: pymongo; extra == "mongodb"
115
115
  Provides-Extra: msg
116
116
  Requires-Dist: unstructured[msg]; extra == "msg"
117
117
  Provides-Extra: notion
118
+ Requires-Dist: notion-client; extra == "notion"
118
119
  Requires-Dist: backoff; extra == "notion"
119
- Requires-Dist: httpx; extra == "notion"
120
120
  Requires-Dist: htmlBuilder; extra == "notion"
121
- Requires-Dist: notion-client; extra == "notion"
121
+ Requires-Dist: httpx; extra == "notion"
122
122
  Provides-Extra: odt
123
123
  Requires-Dist: unstructured[odt]; extra == "odt"
124
124
  Provides-Extra: onedrive
125
- Requires-Dist: bs4; extra == "onedrive"
126
125
  Requires-Dist: msal; extra == "onedrive"
127
126
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
127
+ Requires-Dist: bs4; extra == "onedrive"
128
128
  Provides-Extra: openai
129
129
  Requires-Dist: openai; extra == "openai"
130
130
  Requires-Dist: tiktoken; extra == "openai"
@@ -156,8 +156,8 @@ Requires-Dist: unstructured[rst]; extra == "rst"
156
156
  Provides-Extra: rtf
157
157
  Requires-Dist: unstructured[rtf]; extra == "rtf"
158
158
  Provides-Extra: s3
159
- Requires-Dist: fsspec; extra == "s3"
160
159
  Requires-Dist: s3fs; extra == "s3"
160
+ Requires-Dist: fsspec; extra == "s3"
161
161
  Provides-Extra: salesforce
162
162
  Requires-Dist: simple-salesforce; extra == "salesforce"
163
163
  Provides-Extra: sftp
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=HgbcmBIk6mQp0Bz81M53L-kPIBJnMYIFOGkRL73EChs,43
2
+ unstructured_ingest/__version__.py,sha256=i77-gjXpw3EQpetJm6qwuhTR53KoBsdCYSBjHDaGJUQ,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/interfaces.py,sha256=0r0gQoHJQ4DVSQEVbUPBA3N6WyvGMkR1u6U2SwUvoAQ,31361
5
5
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -277,7 +277,7 @@ unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8n
277
277
  unstructured_ingest/v2/cli/base/src.py,sha256=cpQ43qQju4e5s_YSaPxUtA70BaisRkTBdjtlPhqn5Mg,2872
278
278
  unstructured_ingest/v2/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
279
279
  unstructured_ingest/v2/cli/utils/click.py,sha256=Wn2s3PuvBCKB0lsK-W7X_Y0eYyWnS6Y9wWo1OhVBOzY,6344
280
- unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=73DKHQQ6Tm0Lz5NCRduDlyfOhY2KH-MZN1n6jUgrsuU,7480
280
+ unstructured_ingest/v2/cli/utils/model_conversion.py,sha256=uJQKpbTC5ysOdVaRq2SWEjG8btBimVZYzX9NVL7xnzs,7500
281
281
  unstructured_ingest/v2/interfaces/__init__.py,sha256=Rfa8crx6De7WNOK-EjsWWwFVpsUfCc6gY8B8tQ3ae9I,899
282
282
  unstructured_ingest/v2/interfaces/connector.py,sha256=KG0pHdAcpuO5h72xrAkJzADmjxbav31TZ2Wo3PBvwT0,765
283
283
  unstructured_ingest/v2/interfaces/downloader.py,sha256=PKT1kr79Mz1urW_8xCyq9sBuK93gDvyTXg5e4ma4htU,2871
@@ -323,7 +323,7 @@ unstructured_ingest/v2/processes/connectors/milvus.py,sha256=ZUlyAQyTt0U1JoapFYH
323
323
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=2_R_hrEAaTU4vJTCK9oKblWTgv6BKjyUhFtC7uq3q2w,4859
324
324
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=ZiUo-dFo1LMOvFwphSLRZiR1PcrN8GWLTHhsh4TU6n0,9207
325
325
  unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=dfDSNrWIEk19wuHdlMJpp_SLMOteNPlkDBPlAwu1LVY,6767
326
- unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=gCueI1Px7UkI1flNovLMRvcbPGczHI3IlYhOPYlb3WU,6748
326
+ unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=8St-JaVrDdQEVZpRS_TfjFusfjg0bAg3IYyykGFyWdw,7169
327
327
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
328
328
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=hOaV5gBcHFc6N5Rbu3MgM-5Aol1ht-QkNIN4PqjvfxE,19665
329
329
  unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=4rVvWKK2iQr03Ff6cB5zjfE1MpN0JyIGpCxxFCDI6hc,5563
@@ -339,9 +339,9 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=-_pYHbsBG9FyRyN
339
339
  unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=je1BDqFWlyMfPa4oAMMNFQLLQtCY9quuqx3xjTwF8OQ,6251
340
340
  unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=dwpyqDq0qceCBWX3zM1hiUlgXB4hzX6ObOr-sh-5CJs,6926
341
341
  unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
342
- unstructured_ingest-0.0.23.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
343
- unstructured_ingest-0.0.23.dist-info/METADATA,sha256=iWfV6hzGvmClCO7_huz8s-h9FST1mJsc-mUHZQaGQU4,7108
344
- unstructured_ingest-0.0.23.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
345
- unstructured_ingest-0.0.23.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
346
- unstructured_ingest-0.0.23.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
347
- unstructured_ingest-0.0.23.dist-info/RECORD,,
342
+ unstructured_ingest-0.0.24.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
343
+ unstructured_ingest-0.0.24.dist-info/METADATA,sha256=rHTF8fy1vNg5NmCBNVdobYWeGgpn_PBKao2z54UbgnE,7108
344
+ unstructured_ingest-0.0.24.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
345
+ unstructured_ingest-0.0.24.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
346
+ unstructured_ingest-0.0.24.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
347
+ unstructured_ingest-0.0.24.dist-info/RECORD,,