sunholo 0.111.0__py3-none-any.whl → 0.112.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/chunker/loaders.py +21 -12
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/METADATA +6 -4
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/RECORD +7 -7
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/LICENSE.txt +0 -0
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/WHEEL +0 -0
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/entry_points.txt +0 -0
- {sunholo-0.111.0.dist-info → sunholo-0.112.0.dist-info}/top_level.txt +0 -0
sunholo/chunker/loaders.py
CHANGED
|
@@ -11,9 +11,10 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
from
|
|
16
|
-
|
|
14
|
+
try:
|
|
15
|
+
from langchain_unstructured import UnstructuredLoader
|
|
16
|
+
except ImportError:
|
|
17
|
+
UnstructuredLoader = None
|
|
17
18
|
|
|
18
19
|
from langchain_community.document_loaders import GitLoader
|
|
19
20
|
from langchain_community.document_loaders import GoogleDriveLoader
|
|
@@ -159,10 +160,12 @@ def read_gdrive_to_document(url: str, metadata: dict = None):
|
|
|
159
160
|
|
|
160
161
|
def read_url_to_document(url: str, metadata: dict = None):
|
|
161
162
|
|
|
163
|
+
if not UnstructuredLoader:
|
|
164
|
+
raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
|
|
162
165
|
unstructured_kwargs = {"pdf_infer_table_structure": True,
|
|
163
166
|
"extract_image_block_types": ["Image", "Table"]
|
|
164
167
|
}
|
|
165
|
-
loader =
|
|
168
|
+
loader = UnstructuredLoader(web_url=url, mode="elements", unstructured_kwargs=unstructured_kwargs)
|
|
166
169
|
docs = loader.load()
|
|
167
170
|
if metadata is not None:
|
|
168
171
|
for doc in docs:
|
|
@@ -170,7 +173,7 @@ def read_url_to_document(url: str, metadata: dict = None):
|
|
|
170
173
|
if not doc.metadata.get("source") and doc.metadata.get("url"):
|
|
171
174
|
doc.metadata["source"] = doc.metadata["url"]
|
|
172
175
|
|
|
173
|
-
log.info(f"
|
|
176
|
+
log.info(f"UnstructuredLoader docs: {docs}")
|
|
174
177
|
|
|
175
178
|
return docs
|
|
176
179
|
|
|
@@ -184,18 +187,21 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
|
|
|
184
187
|
log.info(f"Already uploaded to bucket, skipping {pdf_path}")
|
|
185
188
|
return []
|
|
186
189
|
|
|
187
|
-
log.info(f"Sending {pdf_path} to
|
|
190
|
+
log.info(f"Sending {pdf_path} to UnstructuredLoader")
|
|
188
191
|
UNSTRUCTURED_URL = os.getenv("UNSTRUCTURED_URL")
|
|
189
192
|
unstructured_kwargs = {"pdf_infer_table_structure": True,
|
|
190
193
|
"extract_image_block_types": ["Image", "Table"]
|
|
191
194
|
}
|
|
192
195
|
|
|
196
|
+
if not UnstructuredLoader:
|
|
197
|
+
raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
|
|
198
|
+
|
|
193
199
|
if UNSTRUCTURED_URL:
|
|
194
200
|
log.debug(f"Found UNSTRUCTURED_URL: {UNSTRUCTURED_URL}")
|
|
195
201
|
the_endpoint = f"{UNSTRUCTURED_URL}/general/v0/general"
|
|
196
202
|
try:
|
|
197
|
-
loader =
|
|
198
|
-
pdf_path,
|
|
203
|
+
loader = UnstructuredLoader(
|
|
204
|
+
file_path=pdf_path,
|
|
199
205
|
url=the_endpoint,
|
|
200
206
|
mode="elements",
|
|
201
207
|
**unstructured_kwargs)
|
|
@@ -206,8 +212,8 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
|
|
|
206
212
|
else:
|
|
207
213
|
raise err
|
|
208
214
|
else:
|
|
209
|
-
loader =
|
|
210
|
-
pdf_path,
|
|
215
|
+
loader = UnstructuredLoader(
|
|
216
|
+
file_path=pdf_path,
|
|
211
217
|
api_key=UNSTRUCTURED_KEY,
|
|
212
218
|
mode="elements",
|
|
213
219
|
**unstructured_kwargs)
|
|
@@ -216,7 +222,7 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
|
|
|
216
222
|
try:
|
|
217
223
|
docs = loader.load() # this takes a long time 30m+ for big PDF files
|
|
218
224
|
except ValueError as e:
|
|
219
|
-
log.info(f"Error for {gs_file} from
|
|
225
|
+
log.info(f"Error for {gs_file} from UnstructuredLoader: {str(e)}")
|
|
220
226
|
pdf_path = pathlib.Path(gs_file)
|
|
221
227
|
if pdf_path.suffix == ".pdf":
|
|
222
228
|
local_doc = read_pdf_file(pdf_path, metadata=metadata)
|
|
@@ -262,13 +268,16 @@ def read_file_to_documents(gs_file: pathlib.Path, metadata: dict = None):
|
|
|
262
268
|
|
|
263
269
|
def convert_to_txt_and_extract(gs_file, split=False):
|
|
264
270
|
|
|
271
|
+
if not UnstructuredLoader:
|
|
272
|
+
raise ImportError("UnstructuredLoader requires 'langchain_unstructured' to be installed")
|
|
273
|
+
|
|
265
274
|
log.info("trying file parsing locally via .txt conversion")
|
|
266
275
|
txt_file = None
|
|
267
276
|
docs = []
|
|
268
277
|
try:
|
|
269
278
|
# Convert the file to .txt and try again
|
|
270
279
|
txt_file = convert_to_txt(gs_file)
|
|
271
|
-
loader =
|
|
280
|
+
loader = UnstructuredLoader(
|
|
272
281
|
txt_file,
|
|
273
282
|
mode="elements")
|
|
274
283
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sunholo
|
|
3
|
-
Version: 0.111.0
|
|
3
|
+
Version: 0.112.0
|
|
4
4
|
Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
|
|
5
5
|
Home-page: https://github.com/sunholo-data/sunholo-py
|
|
6
|
-
Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.111.0.tar.gz
|
|
6
|
+
Download-URL: https://github.com/sunholo-data/sunholo-py/archive/refs/tags/v0.112.0.tar.gz
|
|
7
7
|
Author: Holosun ApS
|
|
8
8
|
Author-email: multivac@sunholo.com
|
|
9
9
|
License: Apache License, Version 2.0
|
|
@@ -59,6 +59,7 @@ Requires-Dist: langchain-google-genai==1.0.10; extra == "all"
|
|
|
59
59
|
Requires-Dist: langchain_google_alloydb_pg; extra == "all"
|
|
60
60
|
Requires-Dist: langchain-anthropic==0.1.23; extra == "all"
|
|
61
61
|
Requires-Dist: langchain-google-vertexai; extra == "all"
|
|
62
|
+
Requires-Dist: langchain-unstructured; extra == "all"
|
|
62
63
|
Requires-Dist: langfuse; extra == "all"
|
|
63
64
|
Requires-Dist: numpy; extra == "all"
|
|
64
65
|
Requires-Dist: pg8000; extra == "all"
|
|
@@ -78,7 +79,7 @@ Requires-Dist: tabulate; extra == "all"
|
|
|
78
79
|
Requires-Dist: tantivy; extra == "all"
|
|
79
80
|
Requires-Dist: tenacity; extra == "all"
|
|
80
81
|
Requires-Dist: tiktoken; extra == "all"
|
|
81
|
-
Requires-Dist: unstructured[local-inference]
|
|
82
|
+
Requires-Dist: unstructured[all-docs,local-inference]; extra == "all"
|
|
82
83
|
Requires-Dist: xlwings; extra == "all"
|
|
83
84
|
Provides-Extra: azure
|
|
84
85
|
Requires-Dist: azure-identity; extra == "azure"
|
|
@@ -98,11 +99,12 @@ Requires-Dist: tantivy; extra == "database"
|
|
|
98
99
|
Provides-Extra: pipeline
|
|
99
100
|
Requires-Dist: GitPython; extra == "pipeline"
|
|
100
101
|
Requires-Dist: lark; extra == "pipeline"
|
|
102
|
+
Requires-Dist: langchain-unstructured; extra == "pipeline"
|
|
101
103
|
Requires-Dist: psutil; extra == "pipeline"
|
|
102
104
|
Requires-Dist: pypdf; extra == "pipeline"
|
|
103
105
|
Requires-Dist: pytesseract; extra == "pipeline"
|
|
104
106
|
Requires-Dist: tabulate; extra == "pipeline"
|
|
105
|
-
Requires-Dist: unstructured[local-inference]
|
|
107
|
+
Requires-Dist: unstructured[all-docs,local-inference]; extra == "pipeline"
|
|
106
108
|
Provides-Extra: gcp
|
|
107
109
|
Requires-Dist: anthropic[vertex]; extra == "gcp"
|
|
108
110
|
Requires-Dist: google-api-python-client; extra == "gcp"
|
|
@@ -34,7 +34,7 @@ sunholo/chunker/azure.py,sha256=MVF9_-QdKUoJqlpEJ49pv2sdjMDxEiMNxzmO7w5nWDQ,3270
|
|
|
34
34
|
sunholo/chunker/doc_handling.py,sha256=UAf9BmUMpKCKRlAMl1qNZK6xDNYWk1z3ARoftWoa_54,8734
|
|
35
35
|
sunholo/chunker/encode_metadata.py,sha256=hxxd9KU35Xi0Z_EL8kt_oD66pKfBLhEjBImC16ew-Eo,1919
|
|
36
36
|
sunholo/chunker/images.py,sha256=id2PBu6XyGEOtgafq2v0c9_O6kxaC_pYFMnbsIitkSg,1868
|
|
37
|
-
sunholo/chunker/loaders.py,sha256=
|
|
37
|
+
sunholo/chunker/loaders.py,sha256=QaM-M1wmbA2iLIDvBKpC5-TPKMlQIxS01gMKj5n9RyM,10547
|
|
38
38
|
sunholo/chunker/message_data.py,sha256=EaiY7_HClpcfPUAYaAm6Zk5ReeZ9s9F_jBVd0kDgI-4,10836
|
|
39
39
|
sunholo/chunker/pdfs.py,sha256=njDPop751GMHi3cOwIKd2Yct-_lWR2gqcB7WykfHphs,2480
|
|
40
40
|
sunholo/chunker/process_chunker_data.py,sha256=uO-YOEHIjAOy0ZMJ0vea9OMNsQBISHfhbtgoyuHiP6s,3598
|
|
@@ -150,9 +150,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
|
|
|
150
150
|
sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
|
|
151
151
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
|
152
152
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
|
153
|
-
sunholo-0.111.0.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
154
|
-
sunholo-0.
|
|
155
|
-
sunholo-0.111.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
156
|
-
sunholo-0.111.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
157
|
-
sunholo-0.111.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
158
|
-
sunholo-0.111.0.dist-info/RECORD,,
|
|
153
|
+
sunholo-0.112.0.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
|
154
|
+
sunholo-0.112.0.dist-info/METADATA,sha256=kiphk-fAQurwpMyZVdgAqVdhS4yUr8AJPLjUAZze2_I,8685
|
|
155
|
+
sunholo-0.112.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
156
|
+
sunholo-0.112.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
|
157
|
+
sunholo-0.112.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
|
158
|
+
sunholo-0.112.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|