sunholo 0.111.0__py3-none-any.whl → 0.112.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,9 +11,10 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- from langchain_community.document_loaders import UnstructuredFileLoader
15
- from langchain_community.document_loaders import UnstructuredAPIFileLoader
16
- from langchain_community.document_loaders import UnstructuredURLLoader
14
+ try:
15
+ from langchain_unstructured import UnstructuredLoader
16
+ except ImportError:
17
+ UnstructuredLoader = None
17
18
 
18
19
  from langchain_community.document_loaders import GitLoader
19
20
  from langchain_community.document_loaders import GoogleDriveLoader
@@ -159,10 +160,12 @@ def read_gdrive_to_document(url: str, metadata: dict = None):
159
160
 
160
161
  def read_url_to_document(url: str, metadata: dict = None):
161
162
 
163
+ if not UnstructuredLoader:
164
+ raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
162
165
  unstructured_kwargs = {"pdf_infer_table_structure": True,
163
166
  "extract_image_block_types": ["Image", "Table"]
164
167
  }
165
- loader = UnstructuredURLLoader(urls=[url], mode="elements", unstructured_kwargs=unstructured_kwargs)
168
+ loader = UnstructuredLoader(web_url=url, mode="elements", unstructured_kwargs=unstructured_kwargs)
166
169
  docs = loader.load()
167
170
  if metadata is not None:
168
171
  for doc in docs:
@@ -170,7 +173,7 @@ def read_url_to_document(url: str, metadata: dict = None):
170
173
  if not doc.metadata.get("source") and doc.metadata.get("url"):
171
174
  doc.metadata["source"] = doc.metadata["url"]
172
175
 
173
- log.info(f"UnstructuredURLLoader docs: {docs}")
176
+ log.info(f"UnstructuredLoader docs: {docs}")
174
177
 
175
178
  return docs
176
179
 
@@ -184,18 +187,21 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
184
187
  log.info(f"Already uploaded to bucket, skipping {pdf_path}")
185
188
  return []
186
189
 
187
- log.info(f"Sending {pdf_path} to UnstructuredAPIFileLoader")
190
+ log.info(f"Sending {pdf_path} to UnstructuredLoader")
188
191
  UNSTRUCTURED_URL = os.getenv("UNSTRUCTURED_URL")
189
192
  unstructured_kwargs = {"pdf_infer_table_structure": True,
190
193
  "extract_image_block_types": ["Image", "Table"]
191
194
  }
192
195
 
196
+ if not UnstructuredLoader:
197
+ raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
198
+
193
199
  if UNSTRUCTURED_URL:
194
200
  log.debug(f"Found UNSTRUCTURED_URL: {UNSTRUCTURED_URL}")
195
201
  the_endpoint = f"{UNSTRUCTURED_URL}/general/v0/general"
196
202
  try:
197
- loader = UnstructuredAPIFileLoader(
198
- pdf_path,
203
+ loader = UnstructuredLoader(
204
+ file_path=pdf_path,
199
205
  url=the_endpoint,
200
206
  mode="elements",
201
207
  **unstructured_kwargs)
@@ -206,8 +212,8 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
206
212
  else:
207
213
  raise err
208
214
  else:
209
- loader = UnstructuredAPIFileLoader(
210
- pdf_path,
215
+ loader = UnstructuredLoader(
216
+ file_path=pdf_path,
211
217
  api_key=UNSTRUCTURED_KEY,
212
218
  mode="elements",
213
219
  **unstructured_kwargs)
@@ -216,7 +222,7 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
216
222
  try:
217
223
  docs = loader.load() # this takes a long time 30m+ for big PDF files
218
224
  except ValueError as e:
219
- log.info(f"Error for {gs_file} from UnstructuredAPIFileLoader: {str(e)}")
225
+ log.info(f"Error for {gs_file} from UnstructuredLoader: {str(e)}")
220
226
  pdf_path = pathlib.Path(gs_file)
221
227
  if pdf_path.suffix == ".pdf":
222
228
  local_doc = read_pdf_file(pdf_path, metadata=metadata)
@@ -262,13 +268,16 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
262
268
 
263
269
  def convert_to_txt_and_extract(gs_file, split=False):
264
270
 
271
+ if not UnstructuredLoader:
272
+ raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
273
+
265
274
  log.info("trying file parsing locally via .txt conversion")
266
275
  txt_file = None
267
276
  docs = []
268
277
  try:
269
278
  # Convert the file to .txt and try again
270
279
  txt_file = convert_to_txt(gs_file)
271
- loader = UnstructuredFileLoader(
280
+ loader = UnstructuredLoader(
272
281
  txt_file,
273
282
  mode="elements")
274
283
 
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sunholo
3
- Version: 0.111.0
3
+ Version: 0.112.0
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Home-page: https://github.com/sunholo-data/sunholo-py
6
- Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.111.0.tar.gz
6
+ Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.112.0.tar.gz
7
7
  Author: Holosun ApS
8
8
  Author-email: multivac@sunholo.com
9
9
  License: Apache License, Version 2.0
@@ -59,6 +59,7 @@ Requires-Dist: langchain-google-genai==1.0.10; extra == "all"
59
59
  Requires-Dist: langchain_google_alloydb_pg; extra == "all"
60
60
  Requires-Dist: langchain-anthropic==0.1.23; extra == "all"
61
61
  Requires-Dist: langchain-google-vertexai; extra == "all"
62
+ Requires-Dist: langchain-unstructured; extra == "all"
62
63
  Requires-Dist: langfuse; extra == "all"
63
64
  Requires-Dist: numpy; extra == "all"
64
65
  Requires-Dist: pg8000; extra == "all"
@@ -78,7 +79,7 @@ Requires-Dist: tabulate; extra == "all"
78
79
  Requires-Dist: tantivy; extra == "all"
79
80
  Requires-Dist: tenacity; extra == "all"
80
81
  Requires-Dist: tiktoken; extra == "all"
81
- Requires-Dist: unstructured[local-inference]==0.14.9; extra == "all"
82
+ Requires-Dist: unstructured[all-docs,local-inference]; extra == "all"
82
83
  Requires-Dist: xlwings; extra == "all"
83
84
  Provides-Extra: azure
84
85
  Requires-Dist: azure-identity; extra == "azure"
@@ -98,11 +99,12 @@ Requires-Dist: tantivy; extra == "database"
98
99
  Provides-Extra: pipeline
99
100
  Requires-Dist: GitPython; extra == "pipeline"
100
101
  Requires-Dist: lark; extra == "pipeline"
102
+ Requires-Dist: langchain-unstructured; extra == "pipeline"
101
103
  Requires-Dist: psutil; extra == "pipeline"
102
104
  Requires-Dist: pypdf; extra == "pipeline"
103
105
  Requires-Dist: pytesseract; extra == "pipeline"
104
106
  Requires-Dist: tabulate; extra == "pipeline"
105
- Requires-Dist: unstructured[local-inference]==0.14.9; extra == "pipeline"
107
+ Requires-Dist: unstructured[all-docs,local-inference]; extra == "pipeline"
106
108
  Provides-Extra: gcp
107
109
  Requires-Dist: anthropic[vertex]; extra == "gcp"
108
110
  Requires-Dist: google-api-python-client; extra == "gcp"
@@ -34,7 +34,7 @@ sunholo/chunker/azure.py,sha256=MVF9_-QdKUoJqlpEJ49pv2sdjMDxEiMNxzmO7w5nWDQ,3270
34
34
  sunholo/chunker/doc_handling.py,sha256=UAf9BmUMpKCKRlAMl1qNZK6xDNYWk1z3ARoftWoa_54,8734
35
35
  sunholo/chunker/encode_metadata.py,sha256=hxxd9KU35Xi0Z_EL8kt_oD66pKfBLhEjBImC16ew-Eo,1919
36
36
  sunholo/chunker/images.py,sha256=id2PBu6XyGEOtgafq2v0c9_O6kxaC_pYFMnbsIitkSg,1868
37
- sunholo/chunker/loaders.py,sha256=n64UgXDB8ZkApnqJwb3VKc8lGTVo8TC1AXmARiaCsQY,10269
37
+ sunholo/chunker/loaders.py,sha256=QaM-M1wmbA2iLIDvBKpC5-TPKMlQIxS01gMKj5n9RyM,10547
38
38
  sunholo/chunker/message_data.py,sha256=EaiY7_HClpcfPUAYaAm6Zk5ReeZ9s9F_jBVd0kDgI-4,10836
39
39
  sunholo/chunker/pdfs.py,sha256=njDPop751GMHi3cOwIKd2Yct-_lWR2gqcB7WykfHphs,2480
40
40
  sunholo/chunker/process_chunker_data.py,sha256=uO-YOEHIjAOy0ZMJ0vea9OMNsQBISHfhbtgoyuHiP6s,3598
@@ -150,9 +150,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
150
150
  sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
151
151
  sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
152
152
  sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
153
- sunholo-0.111.0.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
154
- sunholo-0.111.0.dist-info/METADATA,sha256=DC7Aa6AgAANAsvuyg3h0DLz-d0auWyaZu5oviSIgwpU,8570
155
- sunholo-0.111.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
156
- sunholo-0.111.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
157
- sunholo-0.111.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
158
- sunholo-0.111.0.dist-info/RECORD,,
153
+ sunholo-0.112.0.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
154
+ sunholo-0.112.0.dist-info/METADATA,sha256=kiphk-fAQurwpMyZVdgAqVdhS4yUr8AJPLjUAZze2_I,8685
155
+ sunholo-0.112.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
156
+ sunholo-0.112.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
157
+ sunholo-0.112.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
158
+ sunholo-0.112.0.dist-info/RECORD,,