structifyai 1.178.0__py3-none-any.whl → 1.179.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
structify/_version.py CHANGED
@@ -1,4 +1,4 @@
  # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
 
  __title__ = "structify"
- __version__ = "1.178.0" # x-release-please-version
+ __version__ = "1.179.0" # x-release-please-version
structify/resources/polars.py CHANGED
@@ -16,7 +16,6 @@ from structify.types.entity_param import EntityParam
  from structify.types.property_type_param import PropertyTypeParam
  from structify.types.dataset_create_params import Relationship as CreateRelationshipParam
  from structify.types.knowledge_graph_param import KnowledgeGraphParam
- from structify.types.dataset_view_table_response import Properties
 
  from ..types import TableParam
  from .._compat import cached_property
@@ -35,6 +34,17 @@ from ..types.structure_run_async_params import SourceWebWeb
  __all__ = ["PolarsResource"]
 
  MAX_PARALLEL_REQUESTS = 20
+ STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"
+
+
+ def _collect_entities_with_job_ids(entities: Any) -> List[Dict[str, Any]]:
+     """Collect entity properties with their first job_id."""
+     results: List[Dict[str, Any]] = []
+     for entity in entities:
+         row: Dict[str, Any] = dict(entity.properties)
+         row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
+         results.append(row)
+     return results
 
 
  class PolarsResource(SyncAPIResource):
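Every collection path in this release now funnels through the new module-level helper above, so the job id always lands in the same fixed column. A minimal runnable sketch of its behavior, using stand-in entity objects in place of the real `datasets.view_table` results:

```python
from types import SimpleNamespace

STRUCTIFY_JOB_ID_COLUMN = "structify_job_id"

def _collect_entities_with_job_ids(entities):
    """Mirrors the diffed helper: one row per entity, first job id or None."""
    results = []
    for entity in entities:
        row = dict(entity.properties)
        row[STRUCTIFY_JOB_ID_COLUMN] = entity.job_ids[0] if entity.job_ids else None
        results.append(row)
    return results

# Stand-in entities; the real objects expose the same two attributes.
entities = [
    SimpleNamespace(properties={"name": "Acme"}, job_ids=["job-1", "job-2"]),
    SimpleNamespace(properties={"name": "Globex"}, job_ids=[]),
]
print(_collect_entities_with_job_ids(entities))
# [{'name': 'Acme', 'structify_job_id': 'job-1'},
#  {'name': 'Globex', 'structify_job_id': None}]
```

Note that only the first job id is kept: an entity that accumulated multiple jobs surfaces just `job_ids[0]`.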
@@ -164,8 +174,9 @@ class PolarsResource(SyncAPIResource):
          # Get the node ID when the function is called, not when the batch is processed
          node_id = get_node_id()
 
-         # Create the expected output schema
+         # Create the expected output schema with single job_id column
          expected_schema = properties_to_schema(all_properties)
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          # Apply Structify enrich on the dataframe
          def enhance_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -249,11 +260,10 @@ class PolarsResource(SyncAPIResource):
              # 3. Wait for all jobs to complete
              title = f"Enriching {property_names} for {dataframe_name}"
              self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
-             # 4. Collect the results
-             results = [
-                 entity.properties
-                 for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
-             ]
+             # 4. Collect the results with job_ids
+             results = _collect_entities_with_job_ids(
+                 self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+             )
              # 5. Return the results
              return pl.DataFrame(results, schema=expected_schema)
 
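Because `expected_schema` now always carries `structify_job_id` as `pl.String`, entities that never ran a job materialize as nulls rather than tripping schema validation. A sketch with invented property columns (the `properties_to_schema` output is simulated as a plain dict):

```python
import polars as pl

expected_schema = {"name": pl.String, "ceo": pl.String}  # stand-in for properties_to_schema(...)
expected_schema["structify_job_id"] = pl.String          # the column the diff appends

results = [
    {"name": "Acme", "ceo": "Jane Doe", "structify_job_id": "job-1"},
    {"name": "Globex", "ceo": None, "structify_job_id": None},
]
print(pl.DataFrame(results, schema=expected_schema))
```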
@@ -296,6 +306,7 @@ class PolarsResource(SyncAPIResource):
              target_columns[col_name] = col_info.get("type", pl.String())
 
          output_schema = _merge_schema_with_suffix(input_schema, target_columns, suffix=target_table_name)
+         output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          target_properties: list[Property] = [
              Property(
@@ -412,6 +423,7 @@ class PolarsResource(SyncAPIResource):
                          prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
                      ) # If the column already exists in the input schema, we need to suffix it with the target table name
                      result_row[eff] = target_entity.properties.get(prop_name)
+                 result_row[STRUCTIFY_JOB_ID_COLUMN] = target_entity.job_ids[0] if target_entity.job_ids else None
                  result_rows.append(result_row)
 
              # Handle source rows without relationships
@@ -422,6 +434,7 @@ class PolarsResource(SyncAPIResource):
              for prop_name in target_schema.keys():
                  eff = prop_name if prop_name not in input_schema else f"{prop_name}_{target_table_name}"
                  orphan_row[eff] = None
+             orphan_row[STRUCTIFY_JOB_ID_COLUMN] = None
              result_rows.append(orphan_row)
 
          if not result_rows:
@@ -440,14 +453,11 @@ class PolarsResource(SyncAPIResource):
          dataframe_name: str,
          dataframe_description: str,
          use_proxy: bool = False,
-         include_job_ids: bool = False,
      ) -> LazyFrame:
          """
          Enhance one or more columns of a `LazyFrame` directly from a URL.
 
-         When `include_job_ids=True`, an additional `job_id` column is added to the
-         output DataFrame with the Structify job id for each URL. The job id is not
-         stored in Structify.
+         Adds a `structify_job_id` column with the job id for each row.
          """
 
          # Existing columns & their dtypes from the LazyFrame
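This is the release's one breaking signature change: `include_job_ids` is removed from `enhance_columns_from_url`, and the opt-in `job_id` column becomes an always-present `structify_job_id`. A hedged call-site sketch (the column names below are invented):

```python
import polars as pl

# 1.178.0 (opt-in): client.polars.enhance_columns_from_url(..., include_job_ids=True)
# produced an optional "job_id" column.
#
# 1.179.0: passing include_job_ids now raises TypeError, and the column is
# always present under the fixed name "structify_job_id".
enhanced = pl.DataFrame(
    {
        "url": ["https://example.com"],
        "ceo": ["Jane Doe"],
        "structify_job_id": ["job-1"],
    }
)
print(enhanced.get_column("structify_job_id").to_list())  # ['job-1']
```

Callers upgrading from 1.178.x need to drop the flag and rename any downstream references from `job_id` to `structify_job_id`.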
@@ -475,8 +485,6 @@ class PolarsResource(SyncAPIResource):
              for col_name, (dtype, desc) in new_columns_dict.items()
          ]
 
-         job_id_column: str | None = "job_id" if include_job_ids else None
-
          all_properties = merge_column_properties(pre_existing_properties, new_column_properties)
 
          dataset_name = f"enhance_{dataframe_name}_{uuid.uuid4().hex}"
@@ -504,10 +512,9 @@ class PolarsResource(SyncAPIResource):
          # Get the node ID when the function is called, not when the batch is processed
          node_id = get_node_id()
 
-         # Create the expected output schema
+         # Create the expected output schema with single job_id column
          expected_schema = properties_to_schema(all_properties)
-         if job_id_column is not None:
-             expected_schema[job_id_column] = pl.String
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          # Apply Structify scrape on the dataframe
          def scrape_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
@@ -537,8 +544,6 @@ class PolarsResource(SyncAPIResource):
                  entity_id_to_entity[entity_id] = entity
 
              # 2. Run scrape jobs for each entity
-             job_ids_by_url: Dict[str, str] = {}
-
              def scrape_entity_property(entity_id: str) -> None:
                  entity = entity_id_to_entity[entity_id]
                  url = entity["properties"].get(url_column)
@@ -549,7 +554,7 @@ class PolarsResource(SyncAPIResource):
                          f"URL column {url_column} must be of string type, got {type(entity['properties'][url_column])}"
                      )
 
-                 response = self._client.scrape.scrape(
+                 self._client.scrape.scrape(
                      dataset_name=dataset_name,
                      extraction_criteria=[
                          RequiredProperty(
@@ -566,8 +571,6 @@ class PolarsResource(SyncAPIResource):
                      use_proxy=use_proxy,
                      url=url,
                  )
-                 if job_id_column is not None:
-                     job_ids_by_url[url] = response.job_id
 
              property_list = list(new_columns_dict.keys())
              if len(property_list) == 1:
@@ -592,17 +595,10 @@ class PolarsResource(SyncAPIResource):
              title = f"Scraping {property_names} for {dataframe_name}"
              self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
 
-             # 4. Collect the results
-             results: list[dict[str, Properties]] = []
-             for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name):
-                 properties = entity.properties.copy()
-                 if job_id_column is not None:
-                     url = properties.get(url_column)
-                     if isinstance(url, str):
-                         job_id = job_ids_by_url.get(url)
-                         if job_id is not None:
-                             properties[job_id_column] = job_id
-                 results.append(properties)
+             # 4. Collect the results with job_id
+             results = _collect_entities_with_job_ids(
+                 self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+             )
 
              # 5. Return the results
              return pl.DataFrame(results, schema=expected_schema)
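Retiring `job_ids_by_url` also closes a correctness gap: a dict keyed on URL keeps only the last job id when the same URL appears in multiple rows, while `entity.job_ids` is tracked per entity. A sketch of the old failure mode:

```python
# Old approach (sketch): duplicate URLs clobber each other's job ids.
job_ids_by_url = {}
for url, job_id in [("https://a.com", "job-1"), ("https://a.com", "job-2")]:
    job_ids_by_url[url] = job_id
print(job_ids_by_url)  # {'https://a.com': 'job-2'} -- job-1 is silently lost

# New approach: each entity carries its own job_ids list, so nothing collides.
```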
@@ -657,6 +653,7 @@ class PolarsResource(SyncAPIResource):
          }
 
          output_schema = _merge_schema_with_suffix(input_schema, scraped_columns, suffix=relationship["target_table"])
+         output_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
 
          properties: list[Property] = []
          for col_name, col_info in scrape_schema.items():
@@ -758,6 +755,9 @@ class PolarsResource(SyncAPIResource):
                          result_row: dict[str, Any] = {
                              **scraped_entity.properties,
                              url_column: related_entity.properties[url_column],
+                             STRUCTIFY_JOB_ID_COLUMN: scraped_entity.job_ids[0]
+                             if scraped_entity.job_ids
+                             else None,
                          }
                          result_rows.append(result_row)
                  offset += LIMIT
@@ -765,8 +765,11 @@ class PolarsResource(SyncAPIResource):
                      break
              except Exception:
                  break
-         # Build scraped schema (pre-join, original names) incl. join column
-         scraped_schema = scraped_columns | {url_column: input_schema[url_column]}
+         # Build scraped schema (pre-join, original names) incl. join column and job_id
+         scraped_schema: Dict[str, pl.DataType] = scraped_columns | {
+             url_column: input_schema[url_column],
+             STRUCTIFY_JOB_ID_COLUMN: pl.String(),
+         }
 
          # Fill missing columns in scraped results
          for result_row in result_rows:
@@ -839,6 +842,7 @@ class PolarsResource(SyncAPIResource):
          polars_schema = pl.Schema(
              [(path_column, pl.String())]
              + [(col_name, col_info.get("type", pl.String())) for col_name, col_info in schema.items()]
+             + [(STRUCTIFY_JOB_ID_COLUMN, pl.String())]
          )
 
          assert path_column in document_paths.collect_schema(), (
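`pl.Schema` accepts an iterable of `(name, dtype)` pairs, so the job-id column is simply appended to the assembled list. A sketch with hypothetical parsed-document columns:

```python
import polars as pl

path_column = "pdf_path"  # hypothetical
schema = {"title": {"type": pl.String()}, "page_count": {"type": pl.Int64()}}

polars_schema = pl.Schema(
    [(path_column, pl.String())]
    + [(col, info.get("type", pl.String())) for col, info in schema.items()]
    + [("structify_job_id", pl.String())]
)
print(polars_schema)
```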
@@ -931,9 +935,15 @@ class PolarsResource(SyncAPIResource):
 
          # Get all of the entities with their job_ids
          entities = self._client.datasets.view_table(dataset=dataset_name, name=table_name)
-         structured_results: List[Dict[str, Any]] = [
-             {**entity.properties, path_column: job_to_pdf_path[entity.job_ids[0]]} for entity in entities
-         ]
+         structured_results: List[Dict[str, Any]] = []
+         for entity in entities:
+             job_id = entity.job_ids[0] if entity.job_ids else None
+             result_row: Dict[str, Any] = {
+                 **entity.properties,
+                 path_column: job_to_pdf_path.get(job_id) if job_id else None,
+                 STRUCTIFY_JOB_ID_COLUMN: job_id,
+             }
+             structured_results.append(result_row)
 
          # Ensure all columns are present with None for missing values
          for result_row in structured_results:
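The replaced comprehension could raise `IndexError` on `entity.job_ids[0]` or `KeyError` on `job_to_pdf_path[...]` for entities without jobs; the explicit loop degrades both columns to `None` instead. A sketch with made-up paths:

```python
from typing import Any, Dict, List, Optional

job_to_pdf_path: Dict[str, str] = {"job-1": "/docs/report.pdf"}

rows: List[Dict[str, Any]] = []
for job_ids in (["job-1"], []):  # the second entity never got a job
    job_id: Optional[str] = job_ids[0] if job_ids else None
    rows.append(
        {
            "pdf_path": job_to_pdf_path.get(job_id) if job_id else None,
            "structify_job_id": job_id,
        }
    )
print(rows)
# [{'pdf_path': '/docs/report.pdf', 'structify_job_id': 'job-1'},
#  {'pdf_path': None, 'structify_job_id': None}]
```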
@@ -986,6 +996,7 @@ class PolarsResource(SyncAPIResource):
          all_properties = existing_properties + [new_property]
 
          expected_schema = properties_to_schema(all_properties)
+         expected_schema[STRUCTIFY_JOB_ID_COLUMN] = pl.String
          if collected_df.is_empty():
              return pl.DataFrame(schema=expected_schema).lazy()
 
@@ -1024,12 +1035,12 @@ class PolarsResource(SyncAPIResource):
              node_id=node_id,
          )
 
-         # 3. Collect the results
+         # 3. Collect the results with job_ids
          title = f"Tagging {new_property_name} for {dataframe_name}"
          self._client.jobs.wait_for_jobs(dataset_name=dataset_name, title=title, node_id=node_id)
-         results = [
-             entity.properties for entity in self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
-         ]
+         results = _collect_entities_with_job_ids(
+             self._client.datasets.view_table(dataset=dataset_name, name=dataframe_name)
+         )
 
          # 4. Return the results
          return pl.DataFrame(results, schema=expected_schema).lazy()
@@ -1157,6 +1168,7 @@ class PolarsResource(SyncAPIResource):
                  "idx1": [match.target_entity_index for match in matches],
                  "idx2": [match.source_entity_index for match in matches],
                  "match_reason": [match.match_reason for match in matches],
+                 STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
              }
          else:
              # No swap, return as normal
@@ -1164,6 +1176,7 @@ class PolarsResource(SyncAPIResource):
                  "idx1": [match.source_entity_index for match in matches],
                  "idx2": [match.target_entity_index for match in matches],
                  "match_reason": [match.match_reason for match in matches],
+                 STRUCTIFY_JOB_ID_COLUMN: [match.job_id for match in matches],
              }
 
          return pl.DataFrame(matches_in_schema).lazy()
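Match results gain the same column; since each match carries its own `job_id`, the frame is built column-wise in both the swapped and unswapped branches. A sketch of the resulting shape (values invented):

```python
import polars as pl

matches_in_schema = {
    "idx1": [0, 1],
    "idx2": [3, 2],
    "match_reason": ["same name", "same address"],
    "structify_job_id": ["job-7", "job-8"],
}
print(pl.DataFrame(matches_in_schema).lazy().collect())
```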
@@ -1182,7 +1195,7 @@ class PolarsResource(SyncAPIResource):
              "/entity/upload_parquet",
              params={"dataset": dataset_name, "table_name": table_name},
              files={"file": ("data.parquet", parquet_bytes.getvalue(), "application/octet-stream")},
-             headers={"Authorization": f"Bearer {self._client.session_token}"},
+             headers=self._client.auth_headers,
          )
          response.raise_for_status()
 
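The parquet upload previously rebuilt a Bearer header from `session_token` by hand; delegating to the client's `auth_headers` keeps this request consistent with however the client is actually authenticated. A sketch of the pattern with a made-up client class (not the structify internals):

```python
class FakeClient:
    """Illustrative only: one property owns the auth scheme."""

    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    @property
    def auth_headers(self) -> dict:
        # Callers never rebuild the header, so changing the credential
        # type later touches exactly one place.
        return {"Authorization": f"Bearer {self.api_key}"}

client = FakeClient("sk-test")
print(client.auth_headers)  # {'Authorization': 'Bearer sk-test'}
```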
structifyai-1.179.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: structifyai
- Version: 1.178.0
+ Version: 1.179.0
  Summary: The official Python library for the structify API
  Project-URL: Homepage, https://github.com/StructifyAI/structify-python
  Project-URL: Repository, https://github.com/StructifyAI/structify-python
structifyai-1.179.0.dist-info/RECORD CHANGED
@@ -11,7 +11,7 @@ structify/_resource.py,sha256=tJi4pDQooQZ_zJwEwrLj-U-ye2hC-cbmr1GzIwCT10Y,1118
  structify/_response.py,sha256=RuNhMDiZUdPqEbmFJHDVI4FMPDszk8QjK9LVWm1Fagk,28806
  structify/_streaming.py,sha256=n4C9M7ITmANYn9LaWHNoqJdIIyF7svLco2qst7u3M7U,10233
  structify/_types.py,sha256=jj4p-m3vpUma0AdhPWIaljHZXeb4RKnrAusjVdpDy5Y,7597
- structify/_version.py,sha256=oPHKgofoBsX3XsUo0vn74-C4ADz2Aq_wNd3D_a2VRiI,163
+ structify/_version.py,sha256=wgYCQOamvnt8XFcVc7rp8aKcLrFsK_GWWmssGhTAKiE,163
  structify/pagination.py,sha256=ycybhWcpKk4ztsMcCA6C0WZiJejGrSx6bSr8LLskJUY,4346
  structify/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  structify/_utils/__init__.py,sha256=7fch0GT9zpNnErbciSpUNa-SjTxxjY6kxHxKMOM4AGs,2305
@@ -39,7 +39,7 @@ structify/resources/external_dataframe_proxy.py,sha256=DSn0YwWIembR__ZtDxVCJtyY3
  structify/resources/jobs.py,sha256=gO1aSByi1dMvW90UDsMmNhLHFCOY4ENLkZcAx4gbLHY,30108
  structify/resources/match.py,sha256=gDWEWnKwEoLbheQAMFltJCk2ysa_L9AuJMOaauM7c4Y,12248
  structify/resources/nango.py,sha256=Zl0M1XhlVe61jHVd-SdYI9uEbEhIRmskvlk7Xp0Lh8g,9166
- structify/resources/polars.py,sha256=W7m8A-q8mdz5-hb3pqcO9q93j5ttMFCGJlEfGWDBFH0,60046
+ structify/resources/polars.py,sha256=9uluqVdLXu5ZpR9-v7B6i3TJVV47zKtILOJwAzz5lkU,60817
  structify/resources/projects.py,sha256=YDikBDB9D1EXyZ2GyRx4GlpQ83snw51YlNuU1sLHqho,14117
  structify/resources/public_sessions.py,sha256=_JmssE0MMjeGdxT0FWtrkcceSV4skeEkVGYeO2FkJes,9976
  structify/resources/sandbox.py,sha256=KgpZ623G6T_3_oovCgjlxO81M63NanMBAezVDdesOCc,12807
@@ -496,7 +496,7 @@ structify/types/user/stripe_create_portal_session_params.py,sha256=5AYRC8z_SlKmd
  structify/types/user/stripe_create_session_params.py,sha256=DFcNLNzEWeupkGQ9J5PafsuL_bIU9cLEIhAmFPsRlfo,387
  structify/types/user/stripe_create_subscription_params.py,sha256=d8HfiC94gJbG-cC_WvBz6xYCvxKJO_EP2yyVmVvufrU,424
  structify/types/user/subscription_plan.py,sha256=qKJMM-zPpYolYC1DlypOwPpxlyJBLkQqFK_0VpwktJs,222
- structifyai-1.178.0.dist-info/METADATA,sha256=YR3Xd2scGjvgDi48CePU9hhyhXkmwtXYDSxiuMoNrDQ,16399
- structifyai-1.178.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
- structifyai-1.178.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
- structifyai-1.178.0.dist-info/RECORD,,
+ structifyai-1.179.0.dist-info/METADATA,sha256=ZntSnSVPD0ghtVBXKP4ejLLRTP16yvtcTVfH2QFZCPw,16399
+ structifyai-1.179.0.dist-info/WHEEL,sha256=C2FUgwZgiLbznR-k0b_5k3Ai_1aASOXDss3lzCUsUug,87
+ structifyai-1.179.0.dist-info/licenses/LICENSE,sha256=9CwgrmGz3rZSTT-KqGc1gua-7g8B4ThTgMtUgPALh5c,11339
+ structifyai-1.179.0.dist-info/RECORD,,