structifyai 1.172.0__py3-none-any.whl → 1.173.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- structify/_version.py +1 -1
- structify/resources/polars.py +11 -12
- structify/types/chat_create_session_params.py +1 -0
- structify/types/code_generate_code_params.py +1 -0
- structify/types/job_event_body.py +8 -0
- {structifyai-1.172.0.dist-info → structifyai-1.173.0.dist-info}/METADATA +1 -1
- {structifyai-1.172.0.dist-info → structifyai-1.173.0.dist-info}/RECORD +9 -9
- {structifyai-1.172.0.dist-info → structifyai-1.173.0.dist-info}/WHEEL +0 -0
- {structifyai-1.172.0.dist-info → structifyai-1.173.0.dist-info}/licenses/LICENSE +0 -0
structify/_version.py
CHANGED
structify/resources/polars.py
CHANGED
|
@@ -935,20 +935,24 @@ class PolarsResource(SyncAPIResource):
|
|
|
935
935
|
# Wait for all PDF processing jobs to complete
|
|
936
936
|
self._client.jobs.wait_for_jobs(job_ids=job_ids, title=f"Parsing {table_name} from PDFs", node_id=node_id)
|
|
937
937
|
|
|
938
|
-
# Collect results from all processed PDFs
|
|
938
|
+
# Collect results from all processed PDFs - each result is tagged with its source row_idx
|
|
939
939
|
structured_results: list[dict[str, Any]] = []
|
|
940
940
|
|
|
941
941
|
def collect_pdf_results(row_idx: int, dataset_name: str) -> List[Dict[str, Any]]:
|
|
942
942
|
pdf_path = batch_rows[row_idx][path_column]
|
|
943
943
|
entities_result = self._client.datasets.view_table(dataset=dataset_name, name=table_name)
|
|
944
|
-
return [
|
|
944
|
+
return [
|
|
945
|
+
{**entity.properties, path_column: pdf_path, "__row_idx__": row_idx} for entity in entities_result
|
|
946
|
+
]
|
|
945
947
|
|
|
946
948
|
with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
|
|
947
949
|
collect_futures = [
|
|
948
950
|
executor.submit(collect_pdf_results, row_idx, dataset_name)
|
|
949
951
|
for row_idx, dataset_name in idx_to_dataset.items()
|
|
950
952
|
]
|
|
951
|
-
for future in tqdm(
|
|
953
|
+
for future in tqdm(
|
|
954
|
+
as_completed(collect_futures), total=len(collect_futures), desc="Collecting PDF extractions"
|
|
955
|
+
):
|
|
952
956
|
results = future.result()
|
|
953
957
|
structured_results.extend(results)
|
|
954
958
|
|
|
@@ -958,17 +962,12 @@ class PolarsResource(SyncAPIResource):
|
|
|
958
962
|
if col_name not in result_row:
|
|
959
963
|
result_row[col_name] = None
|
|
960
964
|
|
|
961
|
-
# Create DataFrame with structured results
|
|
962
965
|
if not structured_results:
|
|
963
|
-
|
|
964
|
-
{col: pl.Series([], dtype=polars_schema[col]) for col in polars_schema.names()}
|
|
965
|
-
)
|
|
966
|
-
else:
|
|
967
|
-
structured_df = pl.DataFrame(structured_results, schema=polars_schema)
|
|
966
|
+
return pl.DataFrame(schema=polars_schema)
|
|
968
967
|
|
|
969
|
-
#
|
|
970
|
-
|
|
971
|
-
return
|
|
968
|
+
# Build result dataframe directly from structured_results without joining
|
|
969
|
+
# Each entity is already tagged with path_column from its source PDF
|
|
970
|
+
return pl.DataFrame(structured_results, schema=polars_schema)
|
|
972
971
|
|
|
973
972
|
return document_paths.map_batches(structure_batch, schema=polars_schema, no_optimizations=True)
|
|
974
973
|
|
|
@@ -44,6 +44,7 @@ class Config(TypedDict, total=False):
|
|
|
44
44
|
"bedrock.claude-sonnet-4-bedrock",
|
|
45
45
|
"bedrock.claude-sonnet-4-5-bedrock",
|
|
46
46
|
"bedrock.claude-opus-4-5-bedrock",
|
|
47
|
+
"bedrock.claude-haiku-4-5-bedrock",
|
|
47
48
|
"gemini.gemini-2.5-pro",
|
|
48
49
|
"gemini.gemini-2.5-flash",
|
|
49
50
|
"gemini.gemini-3-pro-preview",
|
|
@@ -48,6 +48,7 @@ class Config(TypedDict, total=False):
|
|
|
48
48
|
"bedrock.claude-sonnet-4-bedrock",
|
|
49
49
|
"bedrock.claude-sonnet-4-5-bedrock",
|
|
50
50
|
"bedrock.claude-opus-4-5-bedrock",
|
|
51
|
+
"bedrock.claude-haiku-4-5-bedrock",
|
|
51
52
|
"gemini.gemini-2.5-pro",
|
|
52
53
|
"gemini.gemini-2.5-flash",
|
|
53
54
|
"gemini.gemini-3-pro-preview",
|
|
@@ -24,6 +24,7 @@ __all__ = [
|
|
|
24
24
|
"DatahubSchemasCreated",
|
|
25
25
|
"DatahubTablesProcessed",
|
|
26
26
|
"DatahubEmbeddingBatch",
|
|
27
|
+
"ViewedPdfPage",
|
|
27
28
|
]
|
|
28
29
|
|
|
29
30
|
|
|
@@ -164,6 +165,12 @@ class DatahubEmbeddingBatch(BaseModel):
|
|
|
164
165
|
total_batches: int
|
|
165
166
|
|
|
166
167
|
|
|
168
|
+
class ViewedPdfPage(BaseModel):
|
|
169
|
+
event_type: Literal["viewed_pdf_page"]
|
|
170
|
+
|
|
171
|
+
page_index: int
|
|
172
|
+
|
|
173
|
+
|
|
167
174
|
JobEventBody: TypeAlias = Annotated[
|
|
168
175
|
Union[
|
|
169
176
|
AgentNavigated,
|
|
@@ -181,6 +188,7 @@ JobEventBody: TypeAlias = Annotated[
|
|
|
181
188
|
DatahubSchemasCreated,
|
|
182
189
|
DatahubTablesProcessed,
|
|
183
190
|
DatahubEmbeddingBatch,
|
|
191
|
+
ViewedPdfPage,
|
|
184
192
|
],
|
|
185
193
|
PropertyInfo(discriminator="event_type"),
|
|
186
194
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: structifyai
|
|
3
|
-
Version: 1.172.0
|
|
3
|
+
Version: 1.173.0
|
|
4
4
|
Summary: The official Python library for the structify API
|
|
5
5
|
Project-URL: Homepage, https://github.com/StructifyAI/structify-python
|
|
6
6
|
Project-URL: Repository, https://github.com/StructifyAI/structify-python
|
|
@@ -11,7 +11,7 @@ structify/_resource.py,sha256=tJi4pDQooQZ_zJwEwrLj-U-ye2hC-cbmr1GzIwCT10Y,1118
|
|
|
11
11
|
structify/_response.py,sha256=RuNhMDiZUdPqEbmFJHDVI4FMPDszk8QjK9LVWm1Fagk,28806
|
|
12
12
|
structify/_streaming.py,sha256=n4C9M7ITmANYn9LaWHNoqJdIIyF7svLco2qst7u3M7U,10233
|
|
13
13
|
structify/_types.py,sha256=jj4p-m3vpUma0AdhPWIaljHZXeb4RKnrAusjVdpDy5Y,7597
|
|
14
|
-
structify/_version.py,sha256=
|
|
14
|
+
structify/_version.py,sha256=WcTGZuYh0GCneqsaePXe_535r8UvSvOWAg183seZe-g,163
|
|
15
15
|
structify/pagination.py,sha256=ycybhWcpKk4ztsMcCA6C0WZiJejGrSx6bSr8LLskJUY,4346
|
|
16
16
|
structify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
structify/_utils/__init__.py,sha256=7fch0GT9zpNnErbciSpUNa-SjTxxjY6kxHxKMOM4AGs,2305
|
|
@@ -39,7 +39,7 @@ structify/resources/external_dataframe_proxy.py,sha256=DSn0YwWIembR__ZtDxVCJtyY3
|
|
|
39
39
|
structify/resources/jobs.py,sha256=gO1aSByi1dMvW90UDsMmNhLHFCOY4ENLkZcAx4gbLHY,30108
|
|
40
40
|
structify/resources/match.py,sha256=gDWEWnKwEoLbheQAMFltJCk2ysa_L9AuJMOaauM7c4Y,12248
|
|
41
41
|
structify/resources/nango.py,sha256=Zl0M1XhlVe61jHVd-SdYI9uEbEhIRmskvlk7Xp0Lh8g,9166
|
|
42
|
-
structify/resources/polars.py,sha256
|
|
42
|
+
structify/resources/polars.py,sha256=--kVVUoJ7AvGirnc462r0TRsK7lcXXDQD0XWhqtG3K8,61552
|
|
43
43
|
structify/resources/projects.py,sha256=YDikBDB9D1EXyZ2GyRx4GlpQ83snw51YlNuU1sLHqho,14117
|
|
44
44
|
structify/resources/public_sessions.py,sha256=_JmssE0MMjeGdxT0FWtrkcceSV4skeEkVGYeO2FkJes,9976
|
|
45
45
|
structify/resources/sandbox.py,sha256=Gc7uxZAOmbXA477UHvS244BokI1Tte_34xxIj0dC5PA,16984
|
|
@@ -90,7 +90,7 @@ structify/types/chat_admin_issue_found_params.py,sha256=N5YMgm6O-Yr3sXZb2sWDHdyN
|
|
|
90
90
|
structify/types/chat_copy_node_output_by_code_hash_params.py,sha256=gP4V2Y0_Rb4-kwoEceClJLgjVdWv7bgcI4qqQIvdNv8,356
|
|
91
91
|
structify/types/chat_copy_node_output_by_code_hash_response.py,sha256=Cst1RK9jDrMAKEyzLIk5_-D6YUbO-AIVllYui2sy2aQ,272
|
|
92
92
|
structify/types/chat_copy_params.py,sha256=tfA3jdduKDJeHiYjWWKdHifRMp1sCGyGQcYHFuMFdnE,425
|
|
93
|
-
structify/types/chat_create_session_params.py,sha256=
|
|
93
|
+
structify/types/chat_create_session_params.py,sha256=QSkL1Z0Vj0sT8AJ8mcDZQIqAwUkbhiQHuF6811veMmc,1810
|
|
94
94
|
structify/types/chat_delete_files_params.py,sha256=Rv24bWe1CK524xobl9-_APx5GG8KTqIwEtUgYIUlMXc,343
|
|
95
95
|
structify/types/chat_delete_files_response.py,sha256=ZoFJjfZqn_rVuwiFhsuFigN_AQHh_DRkb5KRj7J_49g,225
|
|
96
96
|
structify/types/chat_dependency.py,sha256=J8JLY6kBjFt4dgf-_Vk_HEjxS5R6_6VnTQue2JzZRug,415
|
|
@@ -114,7 +114,7 @@ structify/types/chat_update_session_favorite_params.py,sha256=UNi0YujSs1gCSry2jK
|
|
|
114
114
|
structify/types/chat_update_session_params.py,sha256=5NQZx-j-W2xQGlX0GU-9vwyQt0QunlifWY2MpRtrEv8,382
|
|
115
115
|
structify/types/chat_update_visibility_params.py,sha256=uWtgq4zb_TSO4hIsY0FmdWaljB_20wpjl6IQhRMibTg,361
|
|
116
116
|
structify/types/chat_visibility.py,sha256=GpTgjpcupE1z9Iul8fxOrlMoP_2wjQ_7Z0rJ2y9qO5Q,244
|
|
117
|
-
structify/types/code_generate_code_params.py,sha256=
|
|
117
|
+
structify/types/code_generate_code_params.py,sha256=cX5HYhmrv-9gBZcYMy9jJ8h4vMewt-Zy5jVfAYJsirA,2114
|
|
118
118
|
structify/types/code_interrupt_generation_params.py,sha256=1Y9VOgObIJFyYgAEkUuWZRKKV5-4HcoRA6p5iSEnF3s,410
|
|
119
119
|
structify/types/connector.py,sha256=hqfk8x1ZM39idvAd4wXLm1QNrnT3kRgxEuhk8O28-B0,1069
|
|
120
120
|
structify/types/connector_auth_method.py,sha256=iHBmcNbi74mDjFd_m4-HrGrZoV9_WRSFtrOY0fz9NhQ,562
|
|
@@ -259,7 +259,7 @@ structify/types/granularity.py,sha256=At6biWApGE7uE8jr5KnHP9Jr1yPFkuqLwXjExaswtB
|
|
|
259
259
|
structify/types/image.py,sha256=FpYU3gDZnet0wO17e2uHzcyRUD6E1ssSgv63Ew0DzjU,269
|
|
260
260
|
structify/types/invitation_details_response.py,sha256=TmyeM4mW4Kb6L0d7Ook9cH3g8vzfQYPnvZDIBdgVAO4,272
|
|
261
261
|
structify/types/job_cancel_response.py,sha256=y8M8qPkcXT-pTi4IwQ0JBJQzXeAQIs3u2OsaVeGBTtc,1224
|
|
262
|
-
structify/types/job_event_body.py,sha256=
|
|
262
|
+
structify/types/job_event_body.py,sha256=C3RfyefsBgWrFR_KAK7-Tzmlj6B3f4JVR6iR0HYt1Ks,3706
|
|
263
263
|
structify/types/job_get_scrapers_response.py,sha256=-E9DaG9YCkrJeuffZ3RiI6MW7ZsbPjmlVU3jUQdggSY,694
|
|
264
264
|
structify/types/job_get_source_entities_response.py,sha256=vlGKFkMKjII48EexIEBbls6-PAVYDDkmR1__aKjuiUo,3379
|
|
265
265
|
structify/types/job_list_params.py,sha256=i_MYi4vIFC6Dq5I4E4htYwZejqRE5qAf2lgg_SRBuKg,1012
|
|
@@ -489,7 +489,7 @@ structify/types/user/stripe_create_portal_session_params.py,sha256=5AYRC8z_SlKmd
|
|
|
489
489
|
structify/types/user/stripe_create_session_params.py,sha256=DFcNLNzEWeupkGQ9J5PafsuL_bIU9cLEIhAmFPsRlfo,387
|
|
490
490
|
structify/types/user/stripe_create_subscription_params.py,sha256=d8HfiC94gJbG-cC_WvBz6xYCvxKJO_EP2yyVmVvufrU,424
|
|
491
491
|
structify/types/user/subscription_plan.py,sha256=qKJMM-zPpYolYC1DlypOwPpxlyJBLkQqFK_0VpwktJs,222
|
|
492
|
-
structifyai-1.
|
|
493
|
-
structifyai-1.172.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
494
|
-
structifyai-1.172.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
|
|
495
|
-
structifyai-1.172.0.dist-info/RECORD,,
|
|
492
|
+
structifyai-1.173.0.dist-info/METADATA,sha256=YYeHg00958d2UCUx9tOQB5WRAOIqsU3WZ-Ps3vxzKcU,16399
|
|
493
|
+
structifyai-1.173.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
|
|
494
|
+
structifyai-1.173.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
|
|
495
|
+
structifyai-1.173.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|