structifyai-1.178.0-py3-none-any.whl → structifyai-1.180.0-py3-none-any.whl

This diff shows the changes between two package versions as published to one of the supported public registries. It is provided for informational purposes only.
structify/_version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
2
2
 
3
3
  __title__ = "structify"
4
- __version__ = "1.178.0" # x-release-please-version
4
+ __version__ = "1.180.0" # x-release-please-version
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import Dict, Optional
5
+ from typing import Any, Dict, Optional, cast
6
6
  from typing_extensions import Literal, overload
7
7
 
8
8
  import httpx
@@ -17,6 +17,7 @@ from ...types import (
17
17
  connector_create_secret_params,
18
18
  connector_search_tables_params,
19
19
  connector_update_column_params,
20
+ connector_add_schema_object_params,
20
21
  connector_get_explorer_chat_params,
21
22
  connector_list_with_snippets_params,
22
23
  connector_delete_schema_object_params,
@@ -53,6 +54,7 @@ from ...types.exploration_runs_response import ExplorationRunsResponse
53
54
  from ...types.connector_summaries_response import ConnectorSummariesResponse
54
55
  from ...types.delete_schema_object_response import DeleteSchemaObjectResponse
55
56
  from ...types.connector_search_tables_response import ConnectorSearchTablesResponse
57
+ from ...types.connector_add_schema_object_response import ConnectorAddSchemaObjectResponse
56
58
  from ...types.connector_list_with_snippets_response import ConnectorListWithSnippetsResponse
57
59
  from ...types.connector_get_clarification_requests_response import ConnectorGetClarificationRequestsResponse
58
60
 
@@ -270,6 +272,177 @@ class ConnectorsResource(SyncAPIResource):
270
272
  cast_to=NoneType,
271
273
  )
272
274
 
275
+ @overload
276
+ def add_schema_object(
277
+ self,
278
+ connector_id: str,
279
+ *,
280
+ name: str,
281
+ type: Literal["database"],
282
+ description: Optional[str] | Omit = omit,
283
+ notes: Optional[str] | Omit = omit,
284
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
285
+ # The extra values given here take precedence over values defined on the client or passed to this method.
286
+ extra_headers: Headers | None = None,
287
+ extra_query: Query | None = None,
288
+ extra_body: Body | None = None,
289
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
290
+ ) -> ConnectorAddSchemaObjectResponse:
291
+ """
292
+ Args:
293
+ extra_headers: Send extra headers
294
+
295
+ extra_query: Add additional query parameters to the request
296
+
297
+ extra_body: Add additional JSON properties to the request
298
+
299
+ timeout: Override the client-level default timeout for this request, in seconds
300
+ """
301
+ ...
302
+
303
+ @overload
304
+ def add_schema_object(
305
+ self,
306
+ connector_id: str,
307
+ *,
308
+ database_id: str,
309
+ name: str,
310
+ type: Literal["schema"],
311
+ description: Optional[str] | Omit = omit,
312
+ notes: Optional[str] | Omit = omit,
313
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
314
+ # The extra values given here take precedence over values defined on the client or passed to this method.
315
+ extra_headers: Headers | None = None,
316
+ extra_query: Query | None = None,
317
+ extra_body: Body | None = None,
318
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
319
+ ) -> ConnectorAddSchemaObjectResponse:
320
+ """
321
+ Args:
322
+ extra_headers: Send extra headers
323
+
324
+ extra_query: Add additional query parameters to the request
325
+
326
+ extra_body: Add additional JSON properties to the request
327
+
328
+ timeout: Override the client-level default timeout for this request, in seconds
329
+ """
330
+ ...
331
+
332
+ @overload
333
+ def add_schema_object(
334
+ self,
335
+ connector_id: str,
336
+ *,
337
+ name: str,
338
+ schema_id: str,
339
+ type: Literal["table"],
340
+ description: Optional[str] | Omit = omit,
341
+ endpoint: Optional[str] | Omit = omit,
342
+ notes: Optional[str] | Omit = omit,
343
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
344
+ # The extra values given here take precedence over values defined on the client or passed to this method.
345
+ extra_headers: Headers | None = None,
346
+ extra_query: Query | None = None,
347
+ extra_body: Body | None = None,
348
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
349
+ ) -> ConnectorAddSchemaObjectResponse:
350
+ """
351
+ Args:
352
+ extra_headers: Send extra headers
353
+
354
+ extra_query: Add additional query parameters to the request
355
+
356
+ extra_body: Add additional JSON properties to the request
357
+
358
+ timeout: Override the client-level default timeout for this request, in seconds
359
+ """
360
+ ...
361
+
362
+ @overload
363
+ def add_schema_object(
364
+ self,
365
+ connector_id: str,
366
+ *,
367
+ column_type: str,
368
+ name: str,
369
+ table_id: str,
370
+ type: Literal["column"],
371
+ notes: Optional[str] | Omit = omit,
372
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
373
+ # The extra values given here take precedence over values defined on the client or passed to this method.
374
+ extra_headers: Headers | None = None,
375
+ extra_query: Query | None = None,
376
+ extra_body: Body | None = None,
377
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
378
+ ) -> ConnectorAddSchemaObjectResponse:
379
+ """
380
+ Args:
381
+ extra_headers: Send extra headers
382
+
383
+ extra_query: Add additional query parameters to the request
384
+
385
+ extra_body: Add additional JSON properties to the request
386
+
387
+ timeout: Override the client-level default timeout for this request, in seconds
388
+ """
389
+ ...
390
+
391
+ @required_args(
392
+ ["name", "type"],
393
+ ["database_id", "name", "type"],
394
+ ["name", "schema_id", "type"],
395
+ ["column_type", "name", "table_id", "type"],
396
+ )
397
+ def add_schema_object(
398
+ self,
399
+ connector_id: str,
400
+ *,
401
+ name: str,
402
+ type: Literal["database"] | Literal["schema"] | Literal["table"] | Literal["column"],
403
+ description: Optional[str] | Omit = omit,
404
+ notes: Optional[str] | Omit = omit,
405
+ database_id: str | Omit = omit,
406
+ schema_id: str | Omit = omit,
407
+ endpoint: Optional[str] | Omit = omit,
408
+ column_type: str | Omit = omit,
409
+ table_id: str | Omit = omit,
410
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
411
+ # The extra values given here take precedence over values defined on the client or passed to this method.
412
+ extra_headers: Headers | None = None,
413
+ extra_query: Query | None = None,
414
+ extra_body: Body | None = None,
415
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
416
+ ) -> ConnectorAddSchemaObjectResponse:
417
+ if not connector_id:
418
+ raise ValueError(f"Expected a non-empty value for `connector_id` but received {connector_id!r}")
419
+ return cast(
420
+ ConnectorAddSchemaObjectResponse,
421
+ self._post(
422
+ f"/connectors/{connector_id}/schema_object",
423
+ body=maybe_transform(
424
+ {
425
+ "name": name,
426
+ "type": type,
427
+ "description": description,
428
+ "notes": notes,
429
+ "database_id": database_id,
430
+ "schema_id": schema_id,
431
+ "endpoint": endpoint,
432
+ "column_type": column_type,
433
+ "table_id": table_id,
434
+ },
435
+ connector_add_schema_object_params.ConnectorAddSchemaObjectParams,
436
+ ),
437
+ options=make_request_options(
438
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
439
+ ),
440
+ cast_to=cast(
441
+ Any, ConnectorAddSchemaObjectResponse
442
+ ), # Union types cannot be passed in as arguments in the type system
443
+ ),
444
+ )
445
+
273
446
  def create_secret(
274
447
  self,
275
448
  connector_id: str,
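This release adds an `add_schema_object` method to the sync connectors resource, posting to `/connectors/{connector_id}/schema_object` with four `@overload` signatures (database, schema, table, column) enforced by `required_args`. A minimal usage sketch of the `table` overload follows; the `Structify` client name, environment-based auth, and the placeholder IDs are assumptions, not taken from this diff.

# Hedged sketch: only the method name, endpoint, and keyword arguments come from
# the diff above; the client entry point and all IDs are hypothetical.
from structify import Structify

client = Structify()  # assumes an API key is resolved from the environment

response = client.connectors.add_schema_object(
    "connector_123",           # hypothetical connector id
    name="orders",
    schema_id="schema_456",    # hypothetical parent schema id
    type="table",
    description="Raw orders table",
    endpoint="orders/raw",     # optional per the "table" overload
)
print(response)  # ConnectorAddSchemaObjectResponse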
@@ -1207,6 +1380,177 @@ class AsyncConnectorsResource(AsyncAPIResource):
1207
1380
  cast_to=NoneType,
1208
1381
  )
1209
1382
 
1383
+ @overload
1384
+ async def add_schema_object(
1385
+ self,
1386
+ connector_id: str,
1387
+ *,
1388
+ name: str,
1389
+ type: Literal["database"],
1390
+ description: Optional[str] | Omit = omit,
1391
+ notes: Optional[str] | Omit = omit,
1392
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1393
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1394
+ extra_headers: Headers | None = None,
1395
+ extra_query: Query | None = None,
1396
+ extra_body: Body | None = None,
1397
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1398
+ ) -> ConnectorAddSchemaObjectResponse:
1399
+ """
1400
+ Args:
1401
+ extra_headers: Send extra headers
1402
+
1403
+ extra_query: Add additional query parameters to the request
1404
+
1405
+ extra_body: Add additional JSON properties to the request
1406
+
1407
+ timeout: Override the client-level default timeout for this request, in seconds
1408
+ """
1409
+ ...
1410
+
1411
+ @overload
1412
+ async def add_schema_object(
1413
+ self,
1414
+ connector_id: str,
1415
+ *,
1416
+ database_id: str,
1417
+ name: str,
1418
+ type: Literal["schema"],
1419
+ description: Optional[str] | Omit = omit,
1420
+ notes: Optional[str] | Omit = omit,
1421
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1422
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1423
+ extra_headers: Headers | None = None,
1424
+ extra_query: Query | None = None,
1425
+ extra_body: Body | None = None,
1426
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1427
+ ) -> ConnectorAddSchemaObjectResponse:
1428
+ """
1429
+ Args:
1430
+ extra_headers: Send extra headers
1431
+
1432
+ extra_query: Add additional query parameters to the request
1433
+
1434
+ extra_body: Add additional JSON properties to the request
1435
+
1436
+ timeout: Override the client-level default timeout for this request, in seconds
1437
+ """
1438
+ ...
1439
+
1440
+ @overload
1441
+ async def add_schema_object(
1442
+ self,
1443
+ connector_id: str,
1444
+ *,
1445
+ name: str,
1446
+ schema_id: str,
1447
+ type: Literal["table"],
1448
+ description: Optional[str] | Omit = omit,
1449
+ endpoint: Optional[str] | Omit = omit,
1450
+ notes: Optional[str] | Omit = omit,
1451
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1452
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1453
+ extra_headers: Headers | None = None,
1454
+ extra_query: Query | None = None,
1455
+ extra_body: Body | None = None,
1456
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1457
+ ) -> ConnectorAddSchemaObjectResponse:
1458
+ """
1459
+ Args:
1460
+ extra_headers: Send extra headers
1461
+
1462
+ extra_query: Add additional query parameters to the request
1463
+
1464
+ extra_body: Add additional JSON properties to the request
1465
+
1466
+ timeout: Override the client-level default timeout for this request, in seconds
1467
+ """
1468
+ ...
1469
+
1470
+ @overload
1471
+ async def add_schema_object(
1472
+ self,
1473
+ connector_id: str,
1474
+ *,
1475
+ column_type: str,
1476
+ name: str,
1477
+ table_id: str,
1478
+ type: Literal["column"],
1479
+ notes: Optional[str] | Omit = omit,
1480
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1481
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1482
+ extra_headers: Headers | None = None,
1483
+ extra_query: Query | None = None,
1484
+ extra_body: Body | None = None,
1485
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1486
+ ) -> ConnectorAddSchemaObjectResponse:
1487
+ """
1488
+ Args:
1489
+ extra_headers: Send extra headers
1490
+
1491
+ extra_query: Add additional query parameters to the request
1492
+
1493
+ extra_body: Add additional JSON properties to the request
1494
+
1495
+ timeout: Override the client-level default timeout for this request, in seconds
1496
+ """
1497
+ ...
1498
+
1499
+ @required_args(
1500
+ ["name", "type"],
1501
+ ["database_id", "name", "type"],
1502
+ ["name", "schema_id", "type"],
1503
+ ["column_type", "name", "table_id", "type"],
1504
+ )
1505
+ async def add_schema_object(
1506
+ self,
1507
+ connector_id: str,
1508
+ *,
1509
+ name: str,
1510
+ type: Literal["database"] | Literal["schema"] | Literal["table"] | Literal["column"],
1511
+ description: Optional[str] | Omit = omit,
1512
+ notes: Optional[str] | Omit = omit,
1513
+ database_id: str | Omit = omit,
1514
+ schema_id: str | Omit = omit,
1515
+ endpoint: Optional[str] | Omit = omit,
1516
+ column_type: str | Omit = omit,
1517
+ table_id: str | Omit = omit,
1518
+ # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
1519
+ # The extra values given here take precedence over values defined on the client or passed to this method.
1520
+ extra_headers: Headers | None = None,
1521
+ extra_query: Query | None = None,
1522
+ extra_body: Body | None = None,
1523
+ timeout: float | httpx.Timeout | None | NotGiven = not_given,
1524
+ ) -> ConnectorAddSchemaObjectResponse:
1525
+ if not connector_id:
1526
+ raise ValueError(f"Expected a non-empty value for `connector_id` but received {connector_id!r}")
1527
+ return cast(
1528
+ ConnectorAddSchemaObjectResponse,
1529
+ await self._post(
1530
+ f"/connectors/{connector_id}/schema_object",
1531
+ body=await async_maybe_transform(
1532
+ {
1533
+ "name": name,
1534
+ "type": type,
1535
+ "description": description,
1536
+ "notes": notes,
1537
+ "database_id": database_id,
1538
+ "schema_id": schema_id,
1539
+ "endpoint": endpoint,
1540
+ "column_type": column_type,
1541
+ "table_id": table_id,
1542
+ },
1543
+ connector_add_schema_object_params.ConnectorAddSchemaObjectParams,
1544
+ ),
1545
+ options=make_request_options(
1546
+ extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
1547
+ ),
1548
+ cast_to=cast(
1549
+ Any, ConnectorAddSchemaObjectResponse
1550
+ ), # Union types cannot be passed in as arguments in the type system
1551
+ ),
1552
+ )
1553
+
1210
1554
  async def create_secret(
1211
1555
  self,
1212
1556
  connector_id: str,
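The async resource gains the same method with identical overloads. A brief hedged sketch of the `column` overload, assuming an `AsyncStructify` client (the name is not confirmed by this diff):

import asyncio

from structify import AsyncStructify  # assumed async client entry point

async def main() -> None:
    client = AsyncStructify()
    response = await client.connectors.add_schema_object(
        "connector_123",       # hypothetical connector id
        column_type="varchar",
        name="customer_email",
        table_id="table_789",  # hypothetical parent table id
        type="column",
    )
    print(response)

asyncio.run(main())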
@@ -1951,6 +2295,9 @@ class ConnectorsResourceWithRawResponse:
1951
2295
  self.delete = to_raw_response_wrapper(
1952
2296
  connectors.delete,
1953
2297
  )
2298
+ self.add_schema_object = to_raw_response_wrapper(
2299
+ connectors.add_schema_object,
2300
+ )
1954
2301
  self.create_secret = to_raw_response_wrapper(
1955
2302
  connectors.create_secret,
1956
2303
  )
@@ -2024,6 +2371,9 @@ class AsyncConnectorsResourceWithRawResponse:
2024
2371
  self.delete = async_to_raw_response_wrapper(
2025
2372
  connectors.delete,
2026
2373
  )
2374
+ self.add_schema_object = async_to_raw_response_wrapper(
2375
+ connectors.add_schema_object,
2376
+ )
2027
2377
  self.create_secret = async_to_raw_response_wrapper(
2028
2378
  connectors.create_secret,
2029
2379
  )
@@ -2097,6 +2447,9 @@ class ConnectorsResourceWithStreamingResponse:
2097
2447
  self.delete = to_streamed_response_wrapper(
2098
2448
  connectors.delete,
2099
2449
  )
2450
+ self.add_schema_object = to_streamed_response_wrapper(
2451
+ connectors.add_schema_object,
2452
+ )
2100
2453
  self.create_secret = to_streamed_response_wrapper(
2101
2454
  connectors.create_secret,
2102
2455
  )
@@ -2170,6 +2523,9 @@ class AsyncConnectorsResourceWithStreamingResponse:
2170
2523
  self.delete = async_to_streamed_response_wrapper(
2171
2524
  connectors.delete,
2172
2525
  )
2526
+ self.add_schema_object = async_to_streamed_response_wrapper(
2527
+ connectors.add_schema_object,
2528
+ )
2173
2529
  self.create_secret = async_to_streamed_response_wrapper(
2174
2530
  connectors.create_secret,
2175
2531
  )
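The wrapper classes above also register `add_schema_object` on the raw and streaming response variants. A hedged sketch of reading the raw HTTP response, assuming the usual Stainless-style `.with_raw_response` accessor and `parse()` method:

from structify import Structify  # assumed entry point, as above

client = Structify()
raw = client.connectors.with_raw_response.add_schema_object(
    "connector_123",  # hypothetical connector id
    name="analytics",
    type="database",
)
print(raw.headers.get("x-request-id"))  # inspect headers before parsing
schema_object = raw.parse()             # ConnectorAddSchemaObjectResponse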
@@ -16,7 +16,6 @@ from structify.types.entity_param import EntityParam
16
16
  from structify.types.property_type_param import PropertyTypeParam
17
17
  from structify.types.dataset_create_params import Relationship as CreateRelationshipParam
18
18
  from structify.types.knowledge_graph_param import KnowledgeGraphParam
19
- from structify.types.dataset_view_table_response import Properties
20
19
 
21
20
  from ..types import TableParam
22
21
  from .._compat import cached_property
@@ -35,6 +34,17 @@ from ..types.structure_run_async_params import SourceWebWeb
35
34
  __all__ = ["PolarsResource"]
36
35
 
37
36
  MAX_PARALLEL_REQUESTS = 20
37
+ STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"
38
+
39
+
40
+ def _collect_entities_with_job_ids(entities: Any) -> List[Dict[str, Any]]:
41
+ """Collect entity properties with their first job_id."""
42
+ results: List[Dict[str, Any]] = []
43
+ for entity in entities:
44
+ row: Dict[str, Any] = dict(entity.properties)
45
+ row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
46
+ results.append(row)
47
+ return results
38
48
 
39
49
 
40
50
  class PolarsResource(SyncAPIResource):
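The new module-level constant `STRUCTIFY_JOB_ID_COLUMN` and helper `_collect_entities_with_job_ids` centralize the result collection used throughout this module: each returned row is the entity's properties plus a `structify_job_id` value taken from the entity's first job id (or `None` when there are none). An illustrative sketch using a stand-in entity type; the real objects come from `datasets.view_table`:

from dataclasses import dataclass, field
from typing import Any, Dict, List

@dataclass
class _FakeEntity:  # hypothetical stand-in for the SDK's entity model
    properties: Dict[str, Any]
    job_ids: List[str] = field(default_factory=list)

entities = [_FakeEntity({"name": "Acme"}, ["job_abc"]), _FakeEntity({"name": "Globex"})]
rows = [
    {**e.properties, "structify_job_id": e.job_ids[0] if e.job_ids else None}
    for e in entities
]
# rows == [{"name": "Acme", "structify_job_id": "job_abc"},
#          {"name": "Globex", "structify_job_id": None}]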
@@ -164,8 +174,9 @@ class PolarsResource(SyncAPIResource):
164
174
  # Get the node ID when the function is called, not when the batch is processed
165
175
  node_id = get_node_id()
166
176
 
167
- # Create the expected output schema
177
+ # Create the expected output schema with single job_id column
168
178
  expected_schema = properties_to_schema(all_properties)
179
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
169
180
 
170
181
  # Apply Structify enrich on the dataframe
171
182
  def enhance_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -249,11 +260,10 @@ class PolarsResource(SyncAPIResource):
249
260
  # 3. Wait for all jobs to complete
250
261
  title = f"Enriching {property_names} for {dataframe_name}"
251
262
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
252
- # 4. Collect the results
253
- results = [
254
- entity.properties
255
- for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
256
- ]
263
+ # 4. Collect the results with job_ids
264
+ results = _collect_entities_with_job_ids(
265
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
266
+ )
257
267
  # 5. Return the results
258
268
  return pl.DataFrame(results, schema=expected_schema)
259
269
 
@@ -296,6 +306,7 @@ class PolarsResource(SyncAPIResource):
296
306
  target_columns[col_name] = col_info.get("type", pl.String())
297
307
 
298
308
  output_schema = _merge_schema_with_suffix(input_schema, target_columns, suffix=target_table_name)
309
+ output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
299
310
 
300
311
  target_properties: list[Property] = [
301
312
  Property(
@@ -412,6 +423,7 @@ class PolarsResource(SyncAPIResource):
412
423
  prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
413
424
  ) # If the column already exists in the input schema, we need to suffix it with the target table name
414
425
  result_row[eff] = target_entity.properties.get(prop_name)
426
+ result_row[STRUCTIFY_JOB_ID_COLUMN] = target_entity.job_ids[0] if target_entity.job_ids else None
415
427
  result_rows.append(result_row)
416
428
 
417
429
  # Handle source rows without relationships
@@ -422,6 +434,7 @@ class PolarsResource(SyncAPIResource):
422
434
  for prop_name in target_schema.keys():
423
435
  eff = prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
424
436
  orphan_row[eff] = None
437
+ orphan_row[STRUCTIFY_JOB_ID_COLUMN] = None
425
438
  result_rows.append(orphan_row)
426
439
 
427
440
  if not result_rows:
@@ -440,14 +453,11 @@ class PolarsResource(SyncAPIResource):
440
453
  dataframe_name: str,
441
454
  dataframe_description: str,
442
455
  use_proxy: bool = False,
443
- include_job_ids: bool = False,
444
456
  ) -> LazyFrame:
445
457
  """
446
458
  Enhance one or more columns of a `LazyFrame` directly from a URL.
447
459
 
448
- When `include_job_ids=True`, an additional `job_id` column is added to the
449
- output DataFrame with the Structify job id for each URL. The job id is not
450
- stored in Structify.
460
+ Adds a `structify_job_id` column with the job id for each row.
451
461
  """
452
462
 
453
463
  # Existing columns & their dtypes from the LazyFrame
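With the `include_job_ids` flag removed, the URL-enhancement path always emits the job id under the fixed column name `structify_job_id` instead of the old opt-in `job_id` column. A hedged migration sketch; the toy frame below only stands in for whatever the enhancement call returns:

import polars as pl

# Stand-in for the LazyFrame returned by the URL enhancement call.
enriched = pl.LazyFrame(
    {"url": ["https://example.com"], "title": ["Example"], "structify_job_id": ["job_abc"]}
)

# Code that previously opted in via include_job_ids=True and read "job_id"
# should now read the always-present fixed column name.
job_ids = enriched.select("structify_job_id").collect()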
@@ -475,8 +485,6 @@ class PolarsResource(SyncAPIResource):
475
485
  for col_name, (dtype, desc) in new_columns_dict.items()
476
486
  ]
477
487
 
478
- job_id_column: str | None = "job_id" if include_job_ids else None
479
-
480
488
  all_properties = merge_column_properties(pre_existing_properties, new_column_properties)
481
489
 
482
490
  dataset_name = f"enhance_{dataframe_name}_{uuid.uuid4().hex}"
@@ -504,10 +512,9 @@ class PolarsResource(SyncAPIResource):
504
512
  # Get the node ID when the function is called, not when the batch is processed
505
513
  node_id = get_node_id()
506
514
 
507
- # Create the expected output schema
515
+ # Create the expected output schema with single job_id column
508
516
  expected_schema = properties_to_schema(all_properties)
509
- if job_id_column is not None:
510
- expected_schema[job_id_column] = pl.String
517
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
511
518
 
512
519
  # Apply Structify scrape on the dataframe
513
520
  def scrape_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -537,8 +544,6 @@ class PolarsResource(SyncAPIResource):
537
544
  entity_id_to_entity[entity_id] = entity
538
545
 
539
546
  # 2. Run scrape jobs for each entity
540
- job_ids_by_url: Dict[str, str] = {}
541
-
542
547
  def scrape_entity_property(entity_id: str) -> None:
543
548
  entity = entity_id_to_entity[entity_id]
544
549
  url = entity["properties"].get(url_column)
@@ -549,7 +554,7 @@ class PolarsResource(SyncAPIResource):
549
554
  f"URL column {url_column} must be of string type, got {type(entity['properties'][url_column])}"
550
555
  )
551
556
 
552
- response = self._client.scrape.scrape(
557
+ self._client.scrape.scrape(
553
558
  dataset_name=dataset_name,
554
559
  extraction_criteria=[
555
560
  RequiredProperty(
@@ -566,8 +571,6 @@ class PolarsResource(SyncAPIResource):
566
571
  use_proxy=use_proxy,
567
572
  url=url,
568
573
  )
569
- if job_id_column is not None:
570
- job_ids_by_url[url] = response.job_id
571
574
 
572
575
  property_list = list(new_columns_dict.keys())
573
576
  if len(property_list) == 1:
@@ -592,17 +595,10 @@ class PolarsResource(SyncAPIResource):
592
595
  title = f"Scraping {property_names} for {dataframe_name}"
593
596
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
594
597
 
595
- # 4. Collect the results
596
- results: list[dict[str, Properties]] = []
597
- for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name):
598
- properties = entity.properties.copy()
599
- if job_id_column is not None:
600
- url = properties.get(url_column)
601
- if isinstance(url, str):
602
- job_id = job_ids_by_url.get(url)
603
- if job_id is not None:
604
- properties[job_id_column] = job_id
605
- results.append(properties)
598
+ # 4. Collect the results with job_id
599
+ results = _collect_entities_with_job_ids(
600
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
601
+ )
606
602
 
607
603
  # 5. Return the results
608
604
  return pl.DataFrame(results, schema=expected_schema)
@@ -657,6 +653,7 @@ class PolarsResource(SyncAPIResource):
657
653
  }
658
654
 
659
655
  output_schema = _merge_schema_with_suffix(input_schema, scraped_columns, suffix=relationship["target_table"])
656
+ output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
660
657
 
661
658
  properties: list[Property] = []
662
659
  for col_name, col_info in scrape_schema.items():
@@ -758,6 +755,9 @@ class PolarsResource(SyncAPIResource):
758
755
  result_row: dict[str, Any] = {
759
756
  **scraped_entity.properties,
760
757
  url_column: related_entity.properties[url_column],
758
+ STRUCTIFY_JOB_ID_COLUMN: scraped_entity.job_ids[0]
759
+ if scraped_entity.job_ids
760
+ else None,
761
761
  }
762
762
  result_rows.append(result_row)
763
763
  offset += LIMIT
@@ -765,8 +765,11 @@ class PolarsResource(SyncAPIResource):
765
765
  break
766
766
  except Exception:
767
767
  break
768
- # Build scraped schema (pre-join, original names) incl. join column
769
- scraped_schema = scraped_columns | {url_column: input_schema[url_column]}
768
+ # Build scraped schema (pre-join, original names) incl. join column and job_id
769
+ scraped_schema: Dict[str, pl.DataType] = scraped_columns | {
770
+ url_column: input_schema[url_column],
771
+ STRUCTIFY_JOB_ID_COLUMN: pl.String(),
772
+ }
770
773
 
771
774
  # Fill missing columns in scraped results
772
775
  for result_row in result_rows:
@@ -839,6 +842,7 @@ class PolarsResource(SyncAPIResource):
839
842
  polars_schema = pl.Schema(
840
843
  [(path_column, pl.String())]
841
844
  + [(col_name, col_info.get("type", pl.String())) for col_name, col_info in schema.items()]
845
+ + [(STRUCTIFY_JOB_ID_COLUMN, pl.String())]
842
846
  )
843
847
 
844
848
  assert path_column in document_paths.collect_schema(), (
@@ -931,9 +935,15 @@ class PolarsResource(SyncAPIResource):
931
935
 
932
936
  # Get all of the entities with their job_ids
933
937
  entities = self._client.datasets.view_table(dataset=dataset_name, name=table_name)
934
- structured_results: List[Dict[str, Any]] = [
935
- {**entity.properties, path_column: job_to_pdf_path[entity.job_ids[0]]} for entity in entities
936
- ]
938
+ structured_results: List[Dict[str, Any]] = []
939
+ for entity in entities:
940
+ job_id = entity.job_ids[0] if entity.job_ids else None
941
+ result_row: Dict[str, Any] = {
942
+ **entity.properties,
943
+ path_column: job_to_pdf_path.get(job_id) if job_id else None,
944
+ STRUCTIFY_JOB_ID_COLUMN: job_id,
945
+ }
946
+ structured_results.append(result_row)
937
947
 
938
948
  # Ensure all columns are present with None for missing values
939
949
  for result_row in structured_results:
@@ -986,6 +996,7 @@ class PolarsResource(SyncAPIResource):
986
996
  all_properties = existing_properties + [new_property]
987
997
 
988
998
  expected_schema = properties_to_schema(all_properties)
999
+ expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
989
1000
  if collected_df.is_empty():
990
1001
  return pl.DataFrame(schema=expected_schema).lazy()
991
1002
 
@@ -1024,12 +1035,12 @@ class PolarsResource(SyncAPIResource):
1024
1035
  node_id=node_id,
1025
1036
  )
1026
1037
 
1027
- # 3. Collect the results
1038
+ # 3. Collect the results with job_ids
1028
1039
  title = f"Tagging {new_property_name} for {dataframe_name}"
1029
1040
  self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
1030
- results = [
1031
- entity.properties for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
1032
- ]
1041
+ results = _collect_entities_with_job_ids(
1042
+ self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
1043
+ )
1033
1044
 
1034
1045
  # 4. Return the results
1035
1046
  return pl.DataFrame(results, schema=expected_schema).lazy()
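Since every result-collection path above now appends `structify_job_id`, callers can check which rows actually came back from a job. A small hedged sketch of filtering rows whose job id is missing (the toy frame stands in for the tagging output):

import polars as pl

# Stand-in for the LazyFrame returned by the tagging call above.
tagged = pl.LazyFrame(
    {"name": ["Acme", "Globex"], "category": ["saas", None], "structify_job_id": ["job_1", None]}
)

# Rows with a null job id were not produced by a Structify job and may need a retry.
missing = tagged.filter(pl.col("structify_job_id").is_null()).collect()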
@@ -1157,6 +1168,7 @@ class PolarsResource(SyncAPIResource):
1157
1168
  "idx1": [match.target_entity_index for match in matches],
1158
1169
  "idx2": [match.source_entity_index for match in matches],
1159
1170
  "match_reason": [match.match_reason for match in matches],
1171
+ STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
1160
1172
  }
1161
1173
  else:
1162
1174
  # No swap, return as normal
@@ -1164,6 +1176,7 @@ class PolarsResource(SyncAPIResource):
1164
1176
  "idx1": [match.source_entity_index for match in matches],
1165
1177
  "idx2": [match.target_entity_index for match in matches],
1166
1178
  "match_reason": [match.match_reason for match in matches],
1179
+ STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
1167
1180
  }
1168
1181
 
1169
1182
  return pl.DataFrame(matches_in_schema).lazy()
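Entity-match results likewise carry the per-match job id next to the index pairs. A toy sketch of the resulting frame shape (values are illustrative only):

import polars as pl

matches = pl.DataFrame(
    {
        "idx1": [0, 1],
        "idx2": [3, 2],
        "match_reason": ["same domain", "same company name"],
        "structify_job_id": ["job_a", "job_b"],
    }
)
# Each row pairs a source/target index with the job that produced the match.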
@@ -1182,7 +1195,7 @@ class PolarsResource(SyncAPIResource):
1182
1195
  "/entity/upload_parquet",
1183
1196
  params={"dataset": dataset_name, "table_name": table_name},
1184
1197
  files={"file": ("data.parquet", parquet_bytes.getvalue(), "application/octet-stream")},
1185
- headers={"Authorization": f"Bearer {self._client.session_token}"},
1198
+ headers=self._client.auth_headers,
1186
1199
  )
1187
1200
  response.raise_for_status()
1188
1201