structifyai 1.178.0__py3-none-any.whl → 1.179.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
structify/_version.py CHANGED
@@ -1,4 +1,4 @@
  # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
  __title__ = "structify"
- __version__ = "1.178.0" # x-release-please-version
+ __version__ = "1.179.0" # x-release-please-version
structify/resources/polars.py CHANGED
@@ -16,7 +16,6 @@ from structify.types.entity_param import EntityParam
  from structify.types.property_type_param import PropertyTypeParam
  from structify.types.dataset_create_params import Relationship as CreateRelationshipParam
  from structify.types.knowledge_graph_param import KnowledgeGraphParam
- from structify.types.dataset_view_table_response import Properties
 
  from ..types import TableParam
  from .._compat import cached_property
@@ -35,6 +34,17 @@ from ..types.structure_run_async_params import SourceWebWeb
  __all__ = ["PolarsResource"]
 
  MAX_PARALLEL_REQUESTS = 20
+ STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"
+
+
+ def _collect_entities_with_job_ids(entities: Any) -> List[Dict[str, Any]]:
+     """Collect entity properties with their first job_id."""
+     results: List[Dict[str, Any]] = []
+     for entity in entities:
+         row: Dict[str, Any] = dict(entity.properties)
+         row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
+         results.append(row)
+     return results
 
 
  class PolarsResource(SyncAPIResource):
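Every collection path in this release now funnels through the new module-level helper above, so the job id always lands in the same fixed column. A minimal runnable sketch of its behavior, using stand-in entity objects in place of the real `datasets.view_table` results:

```python
from types import SimpleNamespace

STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"

def _collect_entities_with_job_ids(entities):
    """Mirrors the diffed helper: one row per entity, first job id or None."""
    results = []
    for entity in entities:
        row = dict(entity.properties)
        row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
        results.append(row)
    return results

# Stand-in entities; the real objects expose the same two attributes.
entities = [
    SimpleNamespace(properties={"name": "Acme"}, job_ids=["job-1", "job-2"]),
    SimpleNamespace(properties={"name": "Globex"}, job_ids=[]),
]
print(_collect_entities_with_job_ids(entities))
# [{'name': 'Acme', 'structify_job_id': 'job-1'},
#  {'name': 'Globex', 'structify_job_id': None}]
```

Note that only the first job id is kept: an entity that accumulated multiple jobs surfaces just `job_ids[0]`.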
@@ -164,8 +174,9 @@ class PolarsResource(SyncAPIResource):
          # Get the node ID when the function is called, not when the batch is processed
          node_id = get_node_id()
 
-         # Create the expected output schema
+         # Create the expected output schema with single job_id column
          expected_schema = properties_to_schema(all_properties)
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          # Apply Structify enrich on the dataframe
          def enhance_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -249,11 +260,10 @@ class PolarsResource(SyncAPIResource):
              # 3. Wait for all jobs to complete
              title = f"Enriching {property_names} for {dataframe_name}"
              self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
-             # 4. Collect the results
-             results = [
-                 entity.properties
-                 for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
-             ]
+             # 4. Collect the results with job_ids
+             results = _collect_entities_with_job_ids(
+                 self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+             )
              # 5. Return the results
              return pl.DataFrame(results, schema=expected_schema)
 
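Because `expected_schema` now always carries `structify_job_id` as `pl.String`, entities that never ran a job materialize as nulls rather than tripping schema validation. A sketch with invented property columns (the `properties_to_schema` output is simulated as a plain dict):

```python
import polars as pl

expected_schema = {"name": pl.String, "ceo": pl.String}  # stand-in for properties_to_schema(...)
expected_schema["structify_job_id"] = pl.String          # the column the diff appends

results = [
    {"name": "Acme", "ceo": "Jane Doe", "structify_job_id": "job-1"},
    {"name": "Globex", "ceo": None, "structify_job_id": None},
]
print(pl.DataFrame(results, schema=expected_schema))
```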
@@ -296,6 +306,7 @@ class PolarsResource(SyncAPIResource):
              target_columns[col_name] = col_info.get("type", pl.String())
 
          output_schema = _merge_schema_with_suffix(input_schema, target_columns, suffix=target_table_name)
+         output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          target_properties: list[Property] = [
              Property(
@@ -412,6 +423,7 @@ class PolarsResource(SyncAPIResource):
                          prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
                      ) # If the column already exists in the input schema, we need to suffix it with the target table name
                      result_row[eff] = target_entity.properties.get(prop_name)
+                 result_row[STRUCTIFY_JOB_ID_COLUMN] = target_entity.job_ids[0] if target_entity.job_ids else None
                  result_rows.append(result_row)
 
              # Handle source rows without relationships
@@ -422,6 +434,7 @@ class PolarsResource(SyncAPIResource):
              for prop_name in target_schema.keys():
                  eff = prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
                  orphan_row[eff] = None
+             orphan_row[STRUCTIFY_JOB_ID_COLUMN] = None
              result_rows.append(orphan_row)
 
          if not result_rows:
@@ -440,14 +453,11 @@ class PolarsResource(SyncAPIResource):
          dataframe_name: str,
          dataframe_description: str,
          use_proxy: bool = False,
-         include_job_ids: bool = False,
      ) -> LazyFrame:
          """
          Enhance one or more columns of a `LazyFrame` directly from a URL.
 
-         When `include_job_ids=True`, an additional `job_id` column is added to the
-         output DataFrame with the Structify job id for each URL. The job id is not
-         stored in Structify.
+         Adds a `structify_job_id` column with the job id for each row.
          """
 
          # Existing columns & their dtypes from the LazyFrame
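This is the release's one breaking signature change: `include_job_ids` is removed from `enhance_columns_from_url`, and the opt-in `job_id` column becomes an always-present `structify_job_id`. A hedged call-site sketch (the column names below are invented):

```python
import polars as pl

# 1.178.0 (opt-in): client.polars.enhance_columns_from_url(..., include_job_ids=True)
# produced an optional "job_id" column.
#
# 1.179.0: passing include_job_ids now raises TypeError, and the column is
# always present under the fixed name "structify_job_id".
enhanced = pl.DataFrame(
    {
        "url": ["https://example.com"],
        "ceo": ["Jane Doe"],
        "structify_job_id": ["job-1"],
    }
)
print(enhanced.get_column("structify_job_id").to_list())  # ['job-1']
```

Callers upgrading from 1.178.x need to drop the flag and rename any downstream references from `job_id` to `structify_job_id`.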
@@ -475,8 +485,6 @@ class PolarsResource(SyncAPIResource):
              for col_name, (dtype, desc) in new_columns_dict.items()
          ]
 
-         job_id_column: str | None = "job_id" if include_job_ids else None
-
          all_properties = merge_column_properties(pre_existing_properties, new_column_properties)
 
          dataset_name = f"enhance_{dataframe_name}_{uuid.uuid4().hex}"
@@ -504,10 +512,9 @@ class PolarsResource(SyncAPIResource):
          # Get the node ID when the function is called, not when the batch is processed
          node_id = get_node_id()
 
-         # Create the expected output schema
+         # Create the expected output schema with single job_id column
          expected_schema = properties_to_schema(all_properties)
-         if job_id_column is not None:
-             expected_schema[job_id_column] = pl.String
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          # Apply Structify scrape on the dataframe
          def scrape_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -537,8 +544,6 @@ class PolarsResource(SyncAPIResource):
                  entity_id_to_entity[entity_id] = entity
 
              # 2. Run scrape jobs for each entity
-             job_ids_by_url: Dict[str, str] = {}
-
              def scrape_entity_property(entity_id: str) -> None:
                  entity = entity_id_to_entity[entity_id]
                  url = entity["properties"].get(url_column)
@@ -549,7 +554,7 @@ class PolarsResource(SyncAPIResource):
                          f"URL column {url_column} must be of string type, got {type(entity['properties'][url_column])}"
                      )
 
-                 response = self._client.scrape.scrape(
+                 self._client.scrape.scrape(
                      dataset_name=dataset_name,
                      extraction_criteria=[
                          RequiredProperty(
@@ -566,8 +571,6 @@ class PolarsResource(SyncAPIResource):
                      use_proxy=use_proxy,
                      url=url,
                  )
-                 if job_id_column is not None:
-                     job_ids_by_url[url] = response.job_id
 
              property_list = list(new_columns_dict.keys())
              if len(property_list) == 1:
@@ -592,17 +595,10 @@ class PolarsResource(SyncAPIResource):
              title = f"Scraping {property_names} for {dataframe_name}"
              self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
 
-             # 4. Collect the results
-             results: list[dict[str, Properties]] = []
-             for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name):
-                 properties = entity.properties.copy()
-                 if job_id_column is not None:
-                     url = properties.get(url_column)
-                     if isinstance(url, str):
-                         job_id = job_ids_by_url.get(url)
-                         if job_id is not None:
-                             properties[job_id_column] = job_id
-                 results.append(properties)
+             # 4. Collect the results with job_id
+             results = _collect_entities_with_job_ids(
+                 self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+             )
 
              # 5. Return the results
              return pl.DataFrame(results, schema=expected_schema)
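Retiring `job_ids_by_url` also closes a correctness gap: a dict keyed on URL keeps only the last job id when the same URL appears in multiple rows, while `entity.job_ids` is tracked per entity. A sketch of the old failure mode:

```python
# Old approach (sketch): duplicate URLs clobber each other's job ids.
job_ids_by_url = {}
for url, job_id in [("https://a.com", "job-1"), ("https://a.com", "job-2")]:
    job_ids_by_url[url] = job_id
print(job_ids_by_url)  # {'https://a.com': 'job-2'} -- job-1 is silently lost

# New approach: each entity carries its own job_ids list, so nothing collides.
```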
@@ -657,6 +653,7 @@ class PolarsResource(SyncAPIResource):
          }
 
          output_schema = _merge_schema_with_suffix(input_schema, scraped_columns, suffix=relationship["target_table"])
+         output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          properties: list[Property] = []
          for col_name, col_info in scrape_schema.items():
@@ -758,6 +755,9 @@ class PolarsResource(SyncAPIResource):
                          result_row: dict[str, Any] = {
                              **scraped_entity.properties,
                              url_column: related_entity.properties[url_column],
+                             STRUCTIFY_JOB_ID_COLUMN: scraped_entity.job_ids[0]
+                             if scraped_entity.job_ids
+                             else None,
                          }
                          result_rows.append(result_row)
                  offset += LIMIT
@@ -765,8 +765,11 @@ class PolarsResource(SyncAPIResource):
                      break
              except Exception:
                  break
-         # Build scraped schema (pre-join, original names) incl. join column
-         scraped_schema = scraped_columns | {url_column: input_schema[url_column]}
+         # Build scraped schema (pre-join, original names) incl. join column and job_id
+         scraped_schema: Dict[str, pl.DataType] = scraped_columns | {
+             url_column: input_schema[url_column],
+             STRUCTIFY_JOB_ID_COLUMN: pl.String(),
+         }
 
          # Fill missing columns in scraped results
          for result_row in result_rows:
@@ -839,6 +842,7 @@ class PolarsResource(SyncAPIResource):
          polars_schema = pl.Schema(
              [(path_column, pl.String())]
              + [(col_name, col_info.get("type", pl.String())) for col_name, col_info in schema.items()]
+             + [(STRUCTIFY_JOB_ID_COLUMN, pl.String())]
          )
 
          assert path_column in document_paths.collect_schema(), (
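`pl.Schema` accepts an iterable of `(name, dtype)` pairs, so the job-id column is simply appended to the assembled list. A sketch with hypothetical parsed-document columns:

```python
import polars as pl

path_column = "pdf_path"  # hypothetical
schema = {"title": {"type": pl.String()}, "page_count": {"type": pl.Int64()}}

polars_schema = pl.Schema(
    [(path_column, pl.String())]
    + [(col, info.get("type", pl.String())) for col, info in schema.items()]
    + [("structify_job_id", pl.String())]
)
print(polars_schema)
```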
@@ -931,9 +935,15 @@ class PolarsResource(SyncAPIResource):
 
          # Get all of the entities with their job_ids
          entities = self._client.datasets.view_table(dataset=dataset_name, name=table_name)
-         structured_results: List[Dict[str, Any]] = [
-             {**entity.properties, path_column: job_to_pdf_path[entity.job_ids[0]]} for entity in entities
-         ]
+         structured_results: List[Dict[str, Any]] = []
+         for entity in entities:
+             job_id = entity.job_ids[0] if entity.job_ids else None
+             result_row: Dict[str, Any] = {
+                 **entity.properties,
+                 path_column: job_to_pdf_path.get(job_id) if job_id else None,
+                 STRUCTIFY_JOB_ID_COLUMN: job_id,
+             }
+             structured_results.append(result_row)
 
          # Ensure all columns are present with None for missing values
          for result_row in structured_results:
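The replaced comprehension could raise `IndexError` on `entity.job_ids[0]` or `KeyError` on `job_to_pdf_path[...]` for entities without jobs; the explicit loop degrades both columns to `None` instead. A sketch with made-up paths:

```python
from typing import Any, Dict, List, Optional

job_to_pdf_path: Dict[str, str] = {"job-1": "/docs/report.pdf"}

rows: List[Dict[str, Any]] = []
for job_ids in (["job-1"], []):  # the second entity never got a job
    job_id: Optional[str] = job_ids[0] if job_ids else None
    rows.append(
        {
            "pdf_path": job_to_pdf_path.get(job_id) if job_id else None,
            "structify_job_id": job_id,
        }
    )
print(rows)
# [{'pdf_path': '/docs/report.pdf', 'structify_job_id': 'job-1'},
#  {'pdf_path': None, 'structify_job_id': None}]
```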
@@ -986,6 +996,7 @@ class PolarsResource(SyncAPIResource):
          all_properties = existing_properties + [new_property]
 
          expected_schema = properties_to_schema(all_properties)
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
          if collected_df.is_empty():
              return pl.DataFrame(schema=expected_schema).lazy()
 
@@ -1024,12 +1035,12 @@ class PolarsResource(SyncAPIResource):
              node_id=node_id,
          )
 
-         # 3. Collect the results
+         # 3. Collect the results with job_ids
          title = f"Tagging {new_property_name} for {dataframe_name}"
          self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
-         results = [
-             entity.properties for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
-         ]
+         results = _collect_entities_with_job_ids(
+             self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+         )
 
          # 4. Return the results
          return pl.DataFrame(results, schema=expected_schema).lazy()
@@ -1157,6 +1168,7 @@ class PolarsResource(SyncAPIResource):
                  "idx1": [match.target_entity_index for match in matches],
                  "idx2": [match.source_entity_index for match in matches],
                  "match_reason": [match.match_reason for match in matches],
+                 STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
              }
          else:
              # No swap, return as normal
@@ -1164,6 +1176,7 @@ class PolarsResource(SyncAPIResource):
                  "idx1": [match.source_entity_index for match in matches],
                  "idx2": [match.target_entity_index for match in matches],
                  "match_reason": [match.match_reason for match in matches],
+                 STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
              }
 
          return pl.DataFrame(matches_in_schema).lazy()
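Match results gain the same column; since each match carries its own `job_id`, the frame is built column-wise in both the swapped and unswapped branches. A sketch of the resulting shape (values invented):

```python
import polars as pl

matches_in_schema = {
    "idx1": [0, 1],
    "idx2": [3, 2],
    "match_reason": ["same name", "same address"],
    "structify_job_id": ["job-7", "job-8"],
}
print(pl.DataFrame(matches_in_schema).lazy().collect())
```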
@@ -1182,7 +1195,7 @@ class PolarsResource(SyncAPIResource):
              "/entity/upload_parquet",
              params={"dataset": dataset_name, "table_name": table_name},
              files={"file": ("data.parquet", parquet_bytes.getvalue(), "application/octet-stream")},
-             headers={"Authorization": f"Bearer {self._client.session_token}"},
+             headers=self._client.auth_headers,
          )
          response.raise_for_status()
 
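The parquet upload previously rebuilt a Bearer header from `session_token` by hand; delegating to the client's `auth_headers` keeps this request consistent with however the client is actually authenticated. A sketch of the pattern with a made-up client class (not the structify internals):

```python
class FakeClient:
    """Illustrative only: one property owns the auth scheme."""

    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    @property
    def auth_headers(self) -> dict:
        # Callers never rebuild the header, so changing the credential
        # type later touches exactly one place.
        return {"Authorization": f"Bearer {self.api_key}"}

client = FakeClient("sk-test")
print(client.auth_headers)  # {'Authorization': 'Bearer sk-test'}
```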
structifyai-1.179.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: structifyai
- Version: 1.178.0
+ Version: 1.179.0
  Summary: The official Python library for the structify API
  Project-URL: Homepage, https://github.com/StructifyAI/structify-python
  Project-URL: Repository, https://github.com/StructifyAI/structify-python
structifyai-1.179.0.dist-info/RECORD CHANGED
@@ -11,7 +11,7 @@ structify/_resource.py,sha256=tJi4pDQooQZ_zJwEwrLj-U-ye2hC-cbmr1GzIwCT10Y,1118
  structify/_response.py,sha256=RuNhMDiZUdPqEbmFJHDVI4FMPDszk8QjK9LVWm1Fagk,28806
  structify/_streaming.py,sha256=n4C9M7ITmANYn9LaWHNoqJdIIyF7svLco2qst7u3M7U,10233
  structify/_types.py,sha256=jj4p-m3vpUma0AdhPWIaljHZXeb4RKnrAusjVdpDy5Y,7597
- structify/_version.py,sha256=oPHKgofoBsX3XsUo0vn74-C4ADz2Aq_wNd3D_a2VRiI,163
+ structify/_version.py,sha256=wgYCQOamvnt8XFcVc7rp8aKcLrFsK_GWWmssGhTAKiE,163
  structify/pagination.py,sha256=ycybhWcpKk4ztsMcCA6C0WZiJejGrSx6bSr8LLskJUY,4346
  structify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  structify/_utils/__init__.py,sha256=7fch0GT9zpNnErbciSpUNa-SjTxxjY6kxHxKMOM4AGs,2305
@@ -39,7 +39,7 @@ structify/resources/external_dataframe_proxy.py,sha256=DSn0YwWIembR__ZtDxVCJtyY3
  structify/resources/jobs.py,sha256=gO1aSByi1dMvW90UDsMmNhLHFCOY4ENLkZcAx4gbLHY,30108
  structify/resources/match.py,sha256=gDWEWnKwEoLbheQAMFltJCk2ysa_L9AuJMOaauM7c4Y,12248
  structify/resources/nango.py,sha256=Zl0M1XhlVe61jHVd-SdYI9uEbEhIRmskvlk7Xp0Lh8g,9166
- structify/resources/polars.py,sha256=W7m8A-q8mdz5-hb3pqcO9q93j5ttMFCGJlEfGWDBFH0,60046
+ structify/resources/polars.py,sha256=9uluqVdLXu5ZpR9-v7B6i3TJVV47zKtILOJwAzz5lkU,60817
  structify/resources/projects.py,sha256=YDikBDB9D1EXyZ2GyRx4GlpQ83snw51YlNuU1sLHqho,14117
  structify/resources/public_sessions.py,sha256=_JmssE0MMjeGdxT0FWtrkcceSV4skeEkVGYeO2FkJes,9976
  structify/resources/sandbox.py,sha256=KgpZ623G6T_3_oovCgjlxO81M63NanMBAezVDdesOCc,12807
@@ -496,7 +496,7 @@ structify/types/user/stripe_create_portal_session_params.py,sha256=5AYRC8z_SlKmd
  structify/types/user/stripe_create_session_params.py,sha256=DFcNLNzEWeupkGQ9J5PafsuL_bIU9cLEIhAmFPsRlfo,387
  structify/types/user/stripe_create_subscription_params.py,sha256=d8HfiC94gJbG-cC_WvBz6xYCvxKJO_EP2yyVmVvufrU,424
  structify/types/user/subscription_plan.py,sha256=qKJMM-zPpYolYC1DlypOwPpxlyJBLkQqFK_0VpwktJs,222
- structifyai-1.178.0.dist-info/METADATA,sha256=YR3Xd2scGjvgDi48CePU9hhyhXkmwtXYDSxiuMoNrDQ,16399
- structifyai-1.178.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
- structifyai-1.178.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
- structifyai-1.178.0.dist-info/RECORD,,
+ structifyai-1.179.0.dist-info/METADATA,sha256=ZntSnSVPD0ghtVBXKP4ejLLRTP16yvtcTVfH2QFZCPw,16399
+ structifyai-1.179.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ structifyai-1.179.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
+ structifyai-1.179.0.dist-info/RECORD,,