structifyai-1.178.0-py3-none-any.whl → structifyai-1.180.0-py3-none-any.whl

This diff shows the changes between two package versions as published to one of the supported public registries. It is provided for informational purposes only.
structify/_version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
2
 
3
3
  __title__ = "structify"
4
- __version__ = "1.178.0" # x-release-please-version
4
+ __version__ = "1.180.0" # x-release-please-version
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Dict, Optional
5
+ from typing import Any, Dict, Optional, cast
6
6
  from typing_extensions import Literal, overload
7
7
 
8
8
  import httpx
@@ -17,6 +17,7 @@ from ...types import (
17
17
  connector_create_secret_params,
18
18
  connector_search_tables_params,
19
19
  connector_update_column_params,
20
+ connector_add_schema_object_params,
20
21
  connector_get_explorer_chat_params,
21
22
  connector_list_with_snippets_params,
22
23
  connector_delete_schema_object_params,
@@ -53,6 +54,7 @@ from ...types.exploration_runs_response import ExplorationRunsResponse
53
54
  from ...types.connector_summaries_response import ConnectorSummariesResponse
54
55
  from ...types.delete_schema_object_response import DeleteSchemaObjectResponse
55
56
  from ...types.connector_search_tables_response import ConnectorSearchTablesResponse
57
+ from ...types.connector_add_schema_object_response import ConnectorAddSchemaObjectResponse
56
58
  from ...types.connector_list_with_snippets_response import ConnectorListWithSnippetsResponse
57
59
  from ...types.connector_get_clarification_requests_response import ConnectorGetClarificationRequestsResponse
58
60
 
@@ -270,6 +272,177 @@ class ConnectorsResource(SyncAPIResource):
270
272
  cast_to=NoneType,
271
273
  )
272
274
 
275
+ @overload
276
+ def add_schema_object(
277
+ self,
278
+ connector_id: str,
279
+ *,
280
+ name: str,
281
+ type: Literal["database"],
282
+ description: Optional[str] | Omit = omit,
283
+ notes: Optional[str] | Omit = omit,
284
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
285
+ # The extra values given here take precedence over values defined on the client or passed to this method.
286
+ extra_headers: Headers | None = None,
287
+ extra_query: Query | None = None,
288
+ extra_body: Body | None = None,
289
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
290
+ ) -> ConnectorAddSchemaObjectResponse:
291
+ """
292
+ Args:
293
+ extra_headers: Send extra headers
294
+
295
+ extra_query: Add additional query parameters to the request
296
+
297
+ extra_body: Add additional JSON properties to the request
298
+
299
+ timeout: Override the client-level default timeout for this request, in seconds
300
+ """
301
+ ...
302
+
303
+ @overload
304
+ def add_schema_object(
305
+ self,
306
+ connector_id: str,
307
+ *,
308
+ database_id: str,
309
+ name: str,
310
+ type: Literal["schema"],
311
+ description: Optional[str] | Omit = omit,
312
+ notes: Optional[str] | Omit = omit,
313
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
314
+ # The extra values given here take precedence over values defined on the client or passed to this method.
315
+ extra_headers: Headers | None = None,
316
+ extra_query: Query | None = None,
317
+ extra_body: Body | None = None,
318
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
319
+ ) -> ConnectorAddSchemaObjectResponse:
320
+ """
321
+ Args:
322
+ extra_headers: Send extra headers
323
+
324
+ extra_query: Add additional query parameters to the request
325
+
326
+ extra_body: Add additional JSON properties to the request
327
+
328
+ timeout: Override the client-level default timeout for this request, in seconds
329
+ """
330
+ ...
331
+
332
+ @overload
333
+ def add_schema_object(
334
+ self,
335
+ connector_id: str,
336
+ *,
337
+ name: str,
338
+ schema_id: str,
339
+ type: Literal["table"],
340
+ description: Optional[str] | Omit = omit,
341
+ endpoint: Optional[str] | Omit = omit,
342
+ notes: Optional[str] | Omit = omit,
343
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
344
+ # The extra values given here take precedence over values defined on the client or passed to this method.
345
+ extra_headers: Headers | None = None,
346
+ extra_query: Query | None = None,
347
+ extra_body: Body | None = None,
348
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
349
+ ) -> ConnectorAddSchemaObjectResponse:
350
+ """
351
+ Args:
352
+ extra_headers: Send extra headers
353
+
354
+ extra_query: Add additional query parameters to the request
355
+
356
+ extra_body: Add additional JSON properties to the request
357
+
358
+ timeout: Override the client-level default timeout for this request, in seconds
359
+ """
360
+ ...
361
+
362
+ @overload
363
+ def add_schema_object(
364
+ self,
365
+ connector_id: str,
366
+ *,
367
+ column_type: str,
368
+ name: str,
369
+ table_id: str,
370
+ type: Literal["column"],
371
+ notes: Optional[str] | Omit = omit,
372
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
373
+ # The extra values given here take precedence over values defined on the client or passed to this method.
374
+ extra_headers: Headers | None = None,
375
+ extra_query: Query | None = None,
376
+ extra_body: Body | None = None,
377
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
378
+ ) -> ConnectorAddSchemaObjectResponse:
379
+ """
380
+ Args:
381
+ extra_headers: Send extra headers
382
+
383
+ extra_query: Add additional query parameters to the request
384
+
385
+ extra_body: Add additional JSON properties to the request
386
+
387
+ timeout: Override the client-level default timeout for this request, in seconds
388
+ """
389
+ ...
390
+
391
+ @required_args(
392
+ ["name", "type"],
393
+ ["database_id", "name", "type"],
394
+ ["name", "schema_id", "type"],
395
+ ["column_type", "name", "table_id", "type"],
396
+ )
397
+ def add_schema_object(
398
+ self,
399
+ connector_id: str,
400
+ *,
401
+ name: str,
402
+ type: Literal["database"] | Literal["schema"] | Literal["table"] | Literal["column"],
403
+ description: Optional[str] | Omit = omit,
404
+ notes: Optional[str] | Omit = omit,
405
+ database_id: str | Omit = omit,
406
+ schema_id: str | Omit = omit,
407
+ endpoint: Optional[str] | Omit = omit,
408
+ column_type: str | Omit = omit,
409
+ table_id: str | Omit = omit,
410
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
411
+ # The extra values given here take precedence over values defined on the client or passed to this method.
412
+ extra_headers: Headers | None = None,
413
+ extra_query: Query | None = None,
414
+ extra_body: Body | None = None,
415
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
416
+ ) -> ConnectorAddSchemaObjectResponse:
417
+ if not connector_id:
418
+ raise ValueError(f"Expected a non-empty value for `connector_id` but received {connector_id!r}")
419
+ return cast(
420
+ ConnectorAddSchemaObjectResponse,
421
+ self._post(
422
+ f"/connectors/{connector_id}/schema_object",
423
+ body=maybe_transform(
424
+ {
425
+ "name": name,
426
+ "type": type,
427
+ "description": description,
428
+ "notes": notes,
429
+ "database_id": database_id,
430
+ "schema_id": schema_id,
431
+ "endpoint": endpoint,
432
+ "column_type": column_type,
433
+ "table_id": table_id,
434
+ },
435
+ connector_add_schema_object_params.ConnectorAddSchemaObjectParams,
436
+ ),
437
+ options=make_request_options(
438
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
439
+ ),
440
+ cast_to=cast(
441
+ Any, ConnectorAddSchemaObjectResponse
442
+ ), # Union types cannot be passed in as arguments in the type system
443
+ ),
444
+ )
445
+
273
446
  def create_secret(
274
447
  self,
275
448
  connector_id: str,
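This release adds an `add_schema_object` method to the sync connectors resource, posting to `/connectors/{connector_id}/schema_object` with four `@overload` signatures (database, schema, table, column) enforced by `required_args`. A minimal usage sketch of the `table` overload follows; the `Structify` client name, environment-based auth, and the placeholder IDs are assumptions, not taken from this diff.

# Hedged sketch: only the method name, endpoint, and keyword arguments come from
# the diff above; the client entry point and all IDs are hypothetical.
from structify import Structify

client = Structify()  # assumes an API key is resolved from the environment

response = client.connectors.add_schema_object(
    "connector_123",           # hypothetical connector id
    name="orders",
    schema_id="schema_456",    # hypothetical parent schema id
    type="table",
    description="Raw orders table",
    endpoint="orders/raw",     # optional per the "table" overload
)
print(response)  # ConnectorAddSchemaObjectResponse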
@@ -1207,6 +1380,177 @@ class AsyncConnectorsResource(AsyncAPIResource):
1207
1380
  cast_to=NoneType,
1208
1381
  )
1209
1382
 
1383
+ @overload
1384
+ async def add_schema_object(
1385
+ self,
1386
+ connector_id: str,
1387
+ *,
1388
+ name: str,
1389
+ type: Literal["database"],
1390
+ description: Optional[str] | Omit = omit,
1391
+ notes: Optional[str] | Omit = omit,
1392
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1393
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1394
+ extra_headers: Headers | None = None,
1395
+ extra_query: Query | None = None,
1396
+ extra_body: Body | None = None,
1397
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1398
+ ) -> ConnectorAddSchemaObjectResponse:
1399
+ """
1400
+ Args:
1401
+ extra_headers: Send extra headers
1402
+
1403
+ extra_query: Add additional query parameters to the request
1404
+
1405
+ extra_body: Add additional JSON properties to the request
1406
+
1407
+ timeout: Override the client-level default timeout for this request, in seconds
1408
+ """
1409
+ ...
1410
+
1411
+ @overload
1412
+ async def add_schema_object(
1413
+ self,
1414
+ connector_id: str,
1415
+ *,
1416
+ database_id: str,
1417
+ name: str,
1418
+ type: Literal["schema"],
1419
+ description: Optional[str] | Omit = omit,
1420
+ notes: Optional[str] | Omit = omit,
1421
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1422
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1423
+ extra_headers: Headers | None = None,
1424
+ extra_query: Query | None = None,
1425
+ extra_body: Body | None = None,
1426
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1427
+ ) -> ConnectorAddSchemaObjectResponse:
1428
+ """
1429
+ Args:
1430
+ extra_headers: Send extra headers
1431
+
1432
+ extra_query: Add additional query parameters to the request
1433
+
1434
+ extra_body: Add additional JSON properties to the request
1435
+
1436
+ timeout: Override the client-level default timeout for this request, in seconds
1437
+ """
1438
+ ...
1439
+
1440
+ @overload
1441
+ async def add_schema_object(
1442
+ self,
1443
+ connector_id: str,
1444
+ *,
1445
+ name: str,
1446
+ schema_id: str,
1447
+ type: Literal["table"],
1448
+ description: Optional[str] | Omit = omit,
1449
+ endpoint: Optional[str] | Omit = omit,
1450
+ notes: Optional[str] | Omit = omit,
1451
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1452
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1453
+ extra_headers: Headers | None = None,
1454
+ extra_query: Query | None = None,
1455
+ extra_body: Body | None = None,
1456
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1457
+ ) -> ConnectorAddSchemaObjectResponse:
1458
+ """
1459
+ Args:
1460
+ extra_headers: Send extra headers
1461
+
1462
+ extra_query: Add additional query parameters to the request
1463
+
1464
+ extra_body: Add additional JSON properties to the request
1465
+
1466
+ timeout: Override the client-level default timeout for this request, in seconds
1467
+ """
1468
+ ...
1469
+
1470
+ @overload
1471
+ async def add_schema_object(
1472
+ self,
1473
+ connector_id: str,
1474
+ *,
1475
+ column_type: str,
1476
+ name: str,
1477
+ table_id: str,
1478
+ type: Literal["column"],
1479
+ notes: Optional[str] | Omit = omit,
1480
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1481
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1482
+ extra_headers: Headers | None = None,
1483
+ extra_query: Query | None = None,
1484
+ extra_body: Body | None = None,
1485
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1486
+ ) -> ConnectorAddSchemaObjectResponse:
1487
+ """
1488
+ Args:
1489
+ extra_headers: Send extra headers
1490
+
1491
+ extra_query: Add additional query parameters to the request
1492
+
1493
+ extra_body: Add additional JSON properties to the request
1494
+
1495
+ timeout: Override the client-level default timeout for this request, in seconds
1496
+ """
1497
+ ...
1498
+
1499
+ @required_args(
1500
+ ["name", "type"],
1501
+ ["database_id", "name", "type"],
1502
+ ["name", "schema_id", "type"],
1503
+ ["column_type", "name", "table_id", "type"],
1504
+ )
1505
+ async def add_schema_object(
1506
+ self,
1507
+ connector_id: str,
1508
+ *,
1509
+ name: str,
1510
+ type: Literal["database"] | Literal["schema"] | Literal["table"] | Literal["column"],
1511
+ description: Optional[str] | Omit = omit,
1512
+ notes: Optional[str] | Omit = omit,
1513
+ database_id: str | Omit = omit,
1514
+ schema_id: str | Omit = omit,
1515
+ endpoint: Optional[str] | Omit = omit,
1516
+ column_type: str | Omit = omit,
1517
+ table_id: str | Omit = omit,
1518
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1519
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1520
+ extra_headers: Headers | None = None,
1521
+ extra_query: Query | None = None,
1522
+ extra_body: Body | None = None,
1523
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1524
+ ) -> ConnectorAddSchemaObjectResponse:
1525
+ if not connector_id:
1526
+ raise ValueError(f"Expected a non-empty value for `connector_id` but received {connector_id!r}")
1527
+ return cast(
1528
+ ConnectorAddSchemaObjectResponse,
1529
+ await self._post(
1530
+ f"/connectors/{connector_id}/schema_object",
1531
+ body=await async_maybe_transform(
1532
+ {
1533
+ "name": name,
1534
+ "type": type,
1535
+ "description": description,
1536
+ "notes": notes,
1537
+ "database_id": database_id,
1538
+ "schema_id": schema_id,
1539
+ "endpoint": endpoint,
1540
+ "column_type": column_type,
1541
+ "table_id": table_id,
1542
+ },
1543
+ connector_add_schema_object_params.ConnectorAddSchemaObjectParams,
1544
+ ),
1545
+ options=make_request_options(
1546
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
1547
+ ),
1548
+ cast_to=cast(
1549
+ Any, ConnectorAddSchemaObjectResponse
1550
+ ), # Union types cannot be passed in as arguments in the type system
1551
+ ),
1552
+ )
1553
+
1210
1554
  async def create_secret(
1211
1555
  self,
1212
1556
  connector_id: str,
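The async resource gains the same method with identical overloads. A brief hedged sketch of the `column` overload, assuming an `AsyncStructify` client (the name is not confirmed by this diff):

import asyncio

from structify import AsyncStructify  # assumed async client entry point

async def main() -> None:
    client = AsyncStructify()
    response = await client.connectors.add_schema_object(
        "connector_123",       # hypothetical connector id
        column_type="varchar",
        name="customer_email",
        table_id="table_789",  # hypothetical parent table id
        type="column",
    )
    print(response)

asyncio.run(main())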
@@ -1951,6 +2295,9 @@ class ConnectorsResourceWithRawResponse:
1951
2295
  self.delete = to_raw_response_wrapper(
1952
2296
  connectors.delete,
1953
2297
  )
2298
+ self.add_schema_object = to_raw_response_wrapper(
2299
+ connectors.add_schema_object,
2300
+ )
1954
2301
  self.create_secret = to_raw_response_wrapper(
1955
2302
  connectors.create_secret,
1956
2303
  )
@@ -2024,6 +2371,9 @@ class AsyncConnectorsResourceWithRawResponse:
2024
2371
  self.delete = async_to_raw_response_wrapper(
2025
2372
  connectors.delete,
2026
2373
  )
2374
+ self.add_schema_object = async_to_raw_response_wrapper(
2375
+ connectors.add_schema_object,
2376
+ )
2027
2377
  self.create_secret = async_to_raw_response_wrapper(
2028
2378
  connectors.create_secret,
2029
2379
  )
@@ -2097,6 +2447,9 @@ class ConnectorsResourceWithStreamingResponse:
2097
2447
  self.delete = to_streamed_response_wrapper(
2098
2448
  connectors.delete,
2099
2449
  )
2450
+ self.add_schema_object = to_streamed_response_wrapper(
2451
+ connectors.add_schema_object,
2452
+ )
2100
2453
  self.create_secret = to_streamed_response_wrapper(
2101
2454
  connectors.create_secret,
2102
2455
  )
@@ -2170,6 +2523,9 @@ class AsyncConnectorsResourceWithStreamingResponse:
2170
2523
  self.delete = async_to_streamed_response_wrapper(
2171
2524
  connectors.delete,
2172
2525
  )
2526
+ self.add_schema_object = async_to_streamed_response_wrapper(
2527
+ connectors.add_schema_object,
2528
+ )
2173
2529
  self.create_secret = async_to_streamed_response_wrapper(
2174
2530
  connectors.create_secret,
2175
2531
  )
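The wrapper classes above also register `add_schema_object` on the raw and streaming response variants. A hedged sketch of reading the raw HTTP response, assuming the usual Stainless-style `.with_raw_response` accessor and `parse()` method:

from structify import Structify  # assumed entry point, as above

client = Structify()
raw = client.connectors.with_raw_response.add_schema_object(
    "connector_123",  # hypothetical connector id
    name="analytics",
    type="database",
)
print(raw.headers.get("x-request-id"))  # inspect headers before parsing
schema_object = raw.parse()             # ConnectorAddSchemaObjectResponse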
@@ -16,7 +16,6 @@ from structify.types.entity_param import EntityParam
16
16
  from structify.types.property_type_param import PropertyTypeParam
17
17
  from structify.types.dataset_create_params import Relationship as CreateRelationshipParam
18
18
  from structify.types.knowledge_graph_param import KnowledgeGraphParam
19
- from structify.types.dataset_view_table_response import Properties
20
19
 
21
20
  from ..types import TableParam
22
21
  from .._compat import cached_property
@@ -35,6 +34,17 @@ from ..types.structure_run_async_params import SourceWebWeb
35
34
  __all__ = ["PolarsResource"]
36
35
 
37
36
  MAX_PARALLEL_REQUESTS = 20
37
+ STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"
38
+
39
+
40
+ def _collect_entities_with_job_ids(entities: Any) -> List[Dict[str, Any]]:
41
+ """Collect entity properties with their first job_id."""
42
+ results: List[Dict[str, Any]] = []
43
+ for entity in entities:
44
+ row: Dict[str, Any] = dict(entity.properties)
45
+ row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
46
+ results.append(row)
47
+ return results
38
48
 
39
49
 
40
50
  class PolarsResource(SyncAPIResource):
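The new module-level constant `STRUCTIFY_JOB_ID_COLUMN` and helper `_collect_entities_with_job_ids` centralize the result collection used throughout this module: each returned row is the entity's properties plus a `structify_job_id` value taken from the entity's first job id (or `None` when there are none). An illustrative sketch using a stand-in entity type; the real objects come from `datasets.view_table`:

from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class _FakeEntity:  # hypothetical stand-in for the SDK's entity model
    properties: Dict[str, Any]
    job_ids: List[str] = field(default_factory=list)

entities = [_FakeEntity({"name": "Acme"}, ["job_abc"]), _FakeEntity({"name": "Globex"})]
rows = [
    {**e.properties, "structify_job_id": e.job_ids[0] if e.job_ids else None}
    for e in entities
]
# rows == [{"name": "Acme", "structify_job_id": "job_abc"},
#          {"name": "Globex", "structify_job_id": None}]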
@@ -164,8 +174,9 @@ class PolarsResource(SyncAPIResource):
164
174
  # Get the node ID when the function is called, not when the batch is processed
165
175
  node_id = get_node_id()
166
176
 
167
- # Create the expected output schema
177
+ # Create the expected output schema with single job_id column
168
178
  expected_schema = properties_to_schema(all_properties)
179
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
169
180
 
170
181
  # Apply Structify enrich on the dataframe
171
182
  def enhance_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -249,11 +260,10 @@ class PolarsResource(SyncAPIResource):
249
260
  # 3. Wait for all jobs to complete
250
261
  title = f"Enriching {property_names} for {dataframe_name}"
251
262
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
252
- # 4. Collect the results
253
- results = [
254
- entity.properties
255
- for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
256
- ]
263
+ # 4. Collect the results with job_ids
264
+ results = _collect_entities_with_job_ids(
265
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
266
+ )
257
267
  # 5. Return the results
258
268
  return pl.DataFrame(results, schema=expected_schema)
259
269
 
@@ -296,6 +306,7 @@ class PolarsResource(SyncAPIResource):
296
306
  target_columns[col_name] = col_info.get("type", pl.String())
297
307
 
298
308
  output_schema = _merge_schema_with_suffix(input_schema, target_columns, suffix=target_table_name)
309
+ output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
299
310
 
300
311
  target_properties: list[Property] = [
301
312
  Property(
@@ -412,6 +423,7 @@ class PolarsResource(SyncAPIResource):
412
423
  prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
413
424
  ) # If the column already exists in the input schema, we need to suffix it with the target table name
414
425
  result_row[eff] = target_entity.properties.get(prop_name)
426
+ result_row[STRUCTIFY_JOB_ID_COLUMN] = target_entity.job_ids[0] if target_entity.job_ids else None
415
427
  result_rows.append(result_row)
416
428
 
417
429
  # Handle source rows without relationships
@@ -422,6 +434,7 @@ class PolarsResource(SyncAPIResource):
422
434
  for prop_name in target_schema.keys():
423
435
  eff = prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
424
436
  orphan_row[eff] = None
437
+ orphan_row[STRUCTIFY_JOB_ID_COLUMN] = None
425
438
  result_rows.append(orphan_row)
426
439
 
427
440
  if not result_rows:
@@ -440,14 +453,11 @@ class PolarsResource(SyncAPIResource):
440
453
  dataframe_name: str,
441
454
  dataframe_description: str,
442
455
  use_proxy: bool = False,
443
- include_job_ids: bool = False,
444
456
  ) -> LazyFrame:
445
457
  """
446
458
  Enhance one or more columns of a `LazyFrame` directly from a URL.
447
459
 
448
- When `include_job_ids=True`, an additional `job_id` column is added to the
449
- output DataFrame with the Structify job id for each URL. The job id is not
450
- stored in Structify.
460
+ Adds a `structify_job_id` column with the job id for each row.
451
461
  """
452
462
 
453
463
  # Existing columns & their dtypes from the LazyFrame
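With the `include_job_ids` flag removed, the URL-enhancement path always emits the job id under the fixed column name `structify_job_id` instead of the old opt-in `job_id` column. A hedged migration sketch; the toy frame below only stands in for whatever the enhancement call returns:

import polars as pl

# Stand-in for the LazyFrame returned by the URL enhancement call.
enriched = pl.LazyFrame(
    {"url": ["https://example.com"], "title": ["Example"], "structify_job_id": ["job_abc"]}
)

# Code that previously opted in via include_job_ids=True and read "job_id"
# should now read the always-present fixed column name.
job_ids = enriched.select("structify_job_id").collect()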
@@ -475,8 +485,6 @@ class PolarsResource(SyncAPIResource):
475
485
  for col_name, (dtype, desc) in new_columns_dict.items()
476
486
  ]
477
487
 
478
- job_id_column: str | None = "job_id" if include_job_ids else None
479
-
480
488
  all_properties = merge_column_properties(pre_existing_properties, new_column_properties)
481
489
 
482
490
  dataset_name = f"enhance_{dataframe_name}_{uuid.uuid4().hex}"
@@ -504,10 +512,9 @@ class PolarsResource(SyncAPIResource):
504
512
  # Get the node ID when the function is called, not when the batch is processed
505
513
  node_id = get_node_id()
506
514
 
507
- # Create the expected output schema
515
+ # Create the expected output schema with single job_id column
508
516
  expected_schema = properties_to_schema(all_properties)
509
- if job_id_column is not None:
510
- expected_schema[job_id_column] = pl.String
517
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
511
518
 
512
519
  # Apply Structify scrape on the dataframe
513
520
  def scrape_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -537,8 +544,6 @@ class PolarsResource(SyncAPIResource):
537
544
  entity_id_to_entity[entity_id] = entity
538
545
 
539
546
  # 2. Run scrape jobs for each entity
540
- job_ids_by_url: Dict[str, str] = {}
541
-
542
547
  def scrape_entity_property(entity_id: str) -> None:
543
548
  entity = entity_id_to_entity[entity_id]
544
549
  url = entity["properties"].get(url_column)
@@ -549,7 +554,7 @@ class PolarsResource(SyncAPIResource):
549
554
  f"URL column {url_column} must be of string type, got {type(entity['properties'][url_column])}"
550
555
  )
551
556
 
552
- response = self._client.scrape.scrape(
557
+ self._client.scrape.scrape(
553
558
  dataset_name=dataset_name,
554
559
  extraction_criteria=[
555
560
  RequiredProperty(
@@ -566,8 +571,6 @@ class PolarsResource(SyncAPIResource):
566
571
  use_proxy=use_proxy,
567
572
  url=url,
568
573
  )
569
- if job_id_column is not None:
570
- job_ids_by_url[url] = response.job_id
571
574
 
572
575
  property_list = list(new_columns_dict.keys())
573
576
  if len(property_list) == 1:
@@ -592,17 +595,10 @@ class PolarsResource(SyncAPIResource):
592
595
  title = f"Scraping {property_names} for {dataframe_name}"
593
596
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
594
597
 
595
- # 4. Collect the results
596
- results: list[dict[str, Properties]] = []
597
- for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name):
598
- properties = entity.properties.copy()
599
- if job_id_column is not None:
600
- url = properties.get(url_column)
601
- if isinstance(url, str):
602
- job_id = job_ids_by_url.get(url)
603
- if job_id is not None:
604
- properties[job_id_column] = job_id
605
- results.append(properties)
598
+ # 4. Collect the results with job_id
599
+ results = _collect_entities_with_job_ids(
600
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
601
+ )
606
602
 
607
603
  # 5. Return the results
608
604
  return pl.DataFrame(results, schema=expected_schema)
@@ -657,6 +653,7 @@ class PolarsResource(SyncAPIResource):
657
653
  }
658
654
 
659
655
  output_schema = _merge_schema_with_suffix(input_schema, scraped_columns, suffix=relationship["target_table"])
656
+ output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
660
657
 
661
658
  properties: list[Property] = []
662
659
  for col_name, col_info in scrape_schema.items():
@@ -758,6 +755,9 @@ class PolarsResource(SyncAPIResource):
758
755
  result_row: dict[str, Any] = {
759
756
  **scraped_entity.properties,
760
757
  url_column: related_entity.properties[url_column],
758
+ STRUCTIFY_JOB_ID_COLUMN: scraped_entity.job_ids[0]
759
+ if scraped_entity.job_ids
760
+ else None,
761
761
  }
762
762
  result_rows.append(result_row)
763
763
  offset += LIMIT
@@ -765,8 +765,11 @@ class PolarsResource(SyncAPIResource):
765
765
  break
766
766
  except Exception:
767
767
  break
768
- # Build scraped schema (pre-join, original names) incl. join column
769
- scraped_schema = scraped_columns | {url_column: input_schema[url_column]}
768
+ # Build scraped schema (pre-join, original names) incl. join column and job_id
769
+ scraped_schema: Dict[str, pl.DataType] = scraped_columns | {
770
+ url_column: input_schema[url_column],
771
+ STRUCTIFY_JOB_ID_COLUMN: pl.String(),
772
+ }
770
773
 
771
774
  # Fill missing columns in scraped results
772
775
  for result_row in result_rows:
@@ -839,6 +842,7 @@ class PolarsResource(SyncAPIResource):
839
842
  polars_schema = pl.Schema(
840
843
  [(path_column, pl.String())]
841
844
  + [(col_name, col_info.get("type", pl.String())) for col_name, col_info in schema.items()]
845
+ + [(STRUCTIFY_JOB_ID_COLUMN, pl.String())]
842
846
  )
843
847
 
844
848
  assert path_column in document_paths.collect_schema(), (
@@ -931,9 +935,15 @@ class PolarsResource(SyncAPIResource):
931
935
 
932
936
  # Get all of the entities with their job_ids
933
937
  entities = self._client.datasets.view_table(dataset=dataset_name, name=table_name)
934
- structured_results: List[Dict[str, Any]] = [
935
- {**entity.properties, path_column: job_to_pdf_path[entity.job_ids[0]]} for entity in entities
936
- ]
938
+ structured_results: List[Dict[str, Any]] = []
939
+ for entity in entities:
940
+ job_id = entity.job_ids[0] if entity.job_ids else None
941
+ result_row: Dict[str, Any] = {
942
+ **entity.properties,
943
+ path_column: job_to_pdf_path.get(job_id) if job_id else None,
944
+ STRUCTIFY_JOB_ID_COLUMN: job_id,
945
+ }
946
+ structured_results.append(result_row)
937
947
 
938
948
  # Ensure all columns are present with None for missing values
939
949
  for result_row in structured_results:
@@ -986,6 +996,7 @@ class PolarsResource(SyncAPIResource):
986
996
  all_properties = existing_properties + [new_property]
987
997
 
988
998
  expected_schema = properties_to_schema(all_properties)
999
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
989
1000
  if collected_df.is_empty():
990
1001
  return pl.DataFrame(schema=expected_schema).lazy()
991
1002
 
@@ -1024,12 +1035,12 @@ class PolarsResource(SyncAPIResource):
1024
1035
  node_id=node_id,
1025
1036
  )
1026
1037
 
1027
- # 3. Collect the results
1038
+ # 3. Collect the results with job_ids
1028
1039
  title = f"Tagging {new_property_name} for {dataframe_name}"
1029
1040
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
1030
- results = [
1031
- entity.properties for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
1032
- ]
1041
+ results = _collect_entities_with_job_ids(
1042
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
1043
+ )
1033
1044
 
1034
1045
  # 4. Return the results
1035
1046
  return pl.DataFrame(results, schema=expected_schema).lazy()
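Since every result-collection path above now appends `structify_job_id`, callers can check which rows actually came back from a job. A small hedged sketch of filtering rows whose job id is missing (the toy frame stands in for the tagging output):

import polars as pl

# Stand-in for the LazyFrame returned by the tagging call above.
tagged = pl.LazyFrame(
    {"name": ["Acme", "Globex"], "category": ["saas", None], "structify_job_id": ["job_1", None]}
)

# Rows with a null job id were not produced by a Structify job and may need a retry.
missing = tagged.filter(pl.col("structify_job_id").is_null()).collect()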
@@ -1157,6 +1168,7 @@ class PolarsResource(SyncAPIResource):
1157
1168
  "idx1": [match.target_entity_index for match in matches],
1158
1169
  "idx2": [match.source_entity_index for match in matches],
1159
1170
  "match_reason": [match.match_reason for match in matches],
1171
+ STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
1160
1172
  }
1161
1173
  else:
1162
1174
  # No swap, return as normal
@@ -1164,6 +1176,7 @@ class PolarsResource(SyncAPIResource):
1164
1176
  "idx1": [match.source_entity_index for match in matches],
1165
1177
  "idx2": [match.target_entity_index for match in matches],
1166
1178
  "match_reason": [match.match_reason for match in matches],
1179
+ STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
1167
1180
  }
1168
1181
 
1169
1182
  return pl.DataFrame(matches_in_schema).lazy()
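Entity-match results likewise carry the per-match job id next to the index pairs. A toy sketch of the resulting frame shape (values are illustrative only):

import polars as pl

matches = pl.DataFrame(
    {
        "idx1": [0, 1],
        "idx2": [3, 2],
        "match_reason": ["same domain", "same company name"],
        "structify_job_id": ["job_a", "job_b"],
    }
)
# Each row pairs a source/target index with the job that produced the match.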
@@ -1182,7 +1195,7 @@ class PolarsResource(SyncAPIResource):
1182
1195
  "/entity/upload_parquet",
1183
1196
  params={"dataset": dataset_name, "table_name": table_name},
1184
1197
  files={"file": ("data.parquet", parquet_bytes.getvalue(), "application/octet-stream")},
1185
- headers={"Authorization": f"Bearer {self._client.session_token}"},
1198
+ headers=self._client.auth_headers,
1186
1199
  )
1187
1200
  response.raise_for_status()
1188
1201