PyPI - classifyre-cli - Versions diffs - 0.4.29__tar.gz → 0.4.31__tar.gz - Mend

classifyre-cli 0.4.29__tar.gz → 0.4.31__tar.gz

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (209) hide show

classifyre_cli-0.4.31/.turbo/turbo-build.log ADDED Viewed

@@ -0,0 +1,3 @@
+$ uv sync
+Resolved 265 packages in 164ms
+Checked 50 packages in 1ms

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: classifyre-cli
-Version: 0.4.29
+Version: 0.4.31
 Summary: Classifyre CLI — scan and classify unstructured data sources
 License: MIT
 Keywords: data,ingestion,metadata,pii,secrets,unstructured

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/package.json RENAMED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@classifyre/cli",
-  "version": "0.4.29",
+  "version": "0.4.31",
   "private": true,
   "scripts": {
     "build": "uv sync",

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "classifyre-cli"
-version = "0.4.29"
+version = "0.4.31"
 description = "Classifyre CLI — scan and classify unstructured data sources"
 readme = "README.md"
 requires-python = ">=3.12"

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/main.py RENAMED Viewed

@@ -366,6 +366,12 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
                             len(all_stubs),
                         )
+                    # Persist the advanced AUTOMATIC sampling cursor (no-op for
+                    # other strategies, which return None). Only on the normal
+                    # completion path — a timed-out run must not advance it.
+                    if hasattr(sink, "set_sampling_cursor"):
+                        sink.set_sampling_cursor(source.current_sampling_cursor())
                     await sink.finish()
                     logger.info(
                         "Extraction completed: %s assets in %s batches",

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/models/generated_input.py RENAMED Viewed

@@ -85,9 +85,10 @@ class SlackChannelType(StrEnum):
 class SamplingStrategy(StrEnum):
     """
-    Sampling strategy: RANDOM samples items randomly, LATEST prioritises the most recently modified/created items, ALL scans every item with no limit
+    Sampling strategy. AUTOMATIC (recommended default) incrementally ingests a new slice of not-yet-seen data on every run, remembering its position between runs and wrapping around to re-scan from the start once everything has been covered — eventually ingesting everything at a bounded cost per run. RANDOM samples items randomly. LATEST prioritises the most recently modified/created items. ALL scans every item with no limit.
     """
+    AUTOMATIC = 'AUTOMATIC'
     RANDOM = 'RANDOM'
     LATEST = 'LATEST'
     ALL = 'ALL'
@@ -95,7 +96,7 @@ class SamplingStrategy(StrEnum):
 class SamplingConfig(BaseModel):
     """
-    Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for RANDOM/LATEST and pagination batch size for ALL.
+    Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for AUTOMATIC/RANDOM/LATEST and pagination batch size for ALL.
     """
     model_config = ConfigDict(
@@ -124,7 +125,7 @@ class SamplingConfig(BaseModel):
     )
     rows_per_page: int | None = Field(
         100,
-        description='Tabular sources only. Number of rows per sample (RANDOM/LATEST) or per pagination batch (ALL). Controls memory usage during large table scans.',
+        description='Tabular sources only. Number of rows per sample (AUTOMATIC/RANDOM/LATEST) or per pagination batch (ALL). For AUTOMATIC this is the size of the incremental slice ingested each run. Controls memory usage during large table scans.',
         ge=10,
         le=10000,
     )

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/outputs/rest.py RENAMED Viewed

@@ -127,6 +127,11 @@ class FinalizeIngestRunRequest(BaseModel):
     runner_id: str = Field(serialization_alias="runnerId")
     seen_hashes: list[str] = Field(serialization_alias="seenHashes")
+    # AUTOMATIC sampling cursor to persist on the source for the next run.
+    # Omitted (None) for other strategies so the stored cursor is left untouched.
+    sampling_cursor: dict[str, Any] | None = Field(
+        None, serialization_alias="samplingCursor"
+    )
 class UpdateRunnerStatusRequest(BaseModel):
@@ -165,6 +170,11 @@ class RestOutputSink:
         self.session.mount("https://", adapter)
         self._runner_id = context.runner_id
         self._seen_hashes: set[str] = set()
+        self._sampling_cursor: dict[str, Any] | None = None
+    def set_sampling_cursor(self, cursor: dict[str, Any] | None) -> None:
+        """Record the AUTOMATIC sampling cursor to persist on finalize."""
+        self._sampling_cursor = cursor
     async def start(self) -> None:
         if not self.context.source_id:
@@ -244,11 +254,12 @@ class RestOutputSink:
         payload = FinalizeIngestRunRequest(
             runner_id=runner_id,
             seen_hashes=sorted(self._seen_hashes),
+            sampling_cursor=self._sampling_cursor,
         )
         self._request_json(
             "POST",
             f"/sources/{source_id}/assets/finalize",
-            payload.model_dump(mode="json", by_alias=True),
+            payload.model_dump(mode="json", by_alias=True, exclude_none=True),
         )
         status_payload = UpdateRunnerStatusRequest(status="COMPLETED")

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/base.py RENAMED Viewed

@@ -1,7 +1,11 @@
+import base64
+import json
+import logging
 import os
+import threading
 from abc import ABC, abstractmethod
 from collections.abc import AsyncGenerator, Generator
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, TypeVar
 from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
 from ..outputs.rest import IngestEdge
@@ -12,6 +16,10 @@ from ..utils.hashing import calculate_checksum, normalize_http_url
 from ..utils.validation import validate_output
 from .recipe_normalizer import normalize_source_recipe
+logger = logging.getLogger(__name__)
+_T = TypeVar("_T")
 class BaseSource(ABC):
     """
@@ -26,6 +34,10 @@ class BaseSource(ABC):
     # Default batch size for streaming asset results
     BATCH_SIZE: int = 50
     HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
+    # The API injects the saved AUTOMATIC sampling cursor here (base64-encoded
+    # JSON) before launching the CLI job. The recipe itself cannot carry it
+    # because every source schema sets ``additionalProperties: false``.
+    SAMPLING_CURSOR_ENV = "CLASSIFYRE_SAMPLING_CURSOR"
     def __init__(
         self,
@@ -42,6 +54,11 @@ class BaseSource(ABC):
             runner_id: Optional runner ID (for API runs)
         """
         normalized_recipe = normalize_source_recipe(recipe, recipe.get("type"))
+        # Cursor carried over from the previous run (AUTOMATIC strategy). Read
+        # before the override hook so subclasses can consult it there if needed.
+        self._sampling_cursor: dict[str, Any] = self._load_sampling_cursor()
+        self._next_sampling_cursor: dict[str, Any] | None = None
+        self._sampling_cursor_lock = threading.Lock()
         self._apply_initial_sampling_override(normalized_recipe)
         recipe.clear()
         recipe.update(normalized_recipe)
@@ -55,6 +72,108 @@ class BaseSource(ABC):
     def _apply_initial_sampling_override(self, recipe: dict[str, Any]) -> None:
         pass
+    # ── AUTOMATIC sampling cursor ────────────────────────────────────────
+    #
+    # AUTOMATIC sampling keeps a small, opaque, source-defined cursor in the
+    # API between runs. Each run reads the prior cursor (``sampling_cursor``),
+    # ingests the next slice of not-yet-seen data, then records the advanced
+    # cursor (``set_next_sampling_cursor``). The output sink persists it back to
+    # the API on finalize via ``current_sampling_cursor``. When a source has
+    # ingested everything it should reset the cursor so the next run wraps
+    # around and re-ingests from the start (data is not stale).
+    def _load_sampling_cursor(self) -> dict[str, Any]:
+        raw = os.environ.get(self.SAMPLING_CURSOR_ENV)
+        if not raw:
+            return {}
+        try:
+            decoded = base64.b64decode(raw).decode("utf-8")
+            data = json.loads(decoded)
+        except Exception as exc:
+            logger.warning("Ignoring malformed %s: %s", self.SAMPLING_CURSOR_ENV, exc)
+            return {}
+        return data if isinstance(data, dict) else {}
+    def sampling_cursor(self) -> dict[str, Any]:
+        """Return the cursor saved by the previous run (empty on first run)."""
+        return self._sampling_cursor
+    def set_next_sampling_cursor(self, cursor: dict[str, Any]) -> None:
+        """Record the advanced cursor to persist at the end of this run."""
+        self._next_sampling_cursor = cursor
+    def current_sampling_cursor(self) -> dict[str, Any] | None:
+        """Cursor to persist for the next run, or None to leave it unchanged.
+        Returns None unless this run advanced the cursor (i.e. AUTOMATIC
+        sampling actually ran), so non-AUTOMATIC runs never touch the stored
+        cursor.
+        """
+        return self._next_sampling_cursor
+    def sampling_window_size(self, default: int = 100) -> int:
+        """The per-run AUTOMATIC slice size (``rows_per_page``)."""
+        config = getattr(self, "config", None)
+        sampling = getattr(config, "sampling", None) if config is not None else None
+        size = getattr(sampling, "rows_per_page", None)
+        try:
+            return int(size) if size else default
+        except (TypeError, ValueError):
+            return default
+    def _record_cursor_key(self, key: str, value: Any) -> None:
+        """Thread-safely set ``key`` in the cursor to persist for the next run."""
+        with self._sampling_cursor_lock:
+            nxt = self._next_sampling_cursor if isinstance(self._next_sampling_cursor, dict) else {}
+            nxt = {**nxt, key: value}
+            self._next_sampling_cursor = nxt
+    def automatic_offset(self, key: str) -> int:
+        """Return the saved offset for a keyed AUTOMATIC DB cursor (0 on first run)."""
+        saved = self._sampling_cursor.get(key)
+        return saved if isinstance(saved, int) and saved >= 0 else 0
+    def record_automatic_offset(
+        self, key: str, *, prev_offset: int, fetched: int
+    ) -> None:
+        """Advance a keyed offset cursor; wrap to 0 once a page underfills.
+        Used by sources that page rows directly from the backing store
+        (``skip``/``OFFSET``) rather than materialising a full list.
+        """
+        size = self.sampling_window_size()
+        next_offset = 0 if fetched < size else prev_offset + fetched
+        self._record_cursor_key(key, next_offset)
+    def automatic_window(self, items: list[_T], *, key: str = "items") -> list[_T]:
+        """Return the next AUTOMATIC slice of a stably-ordered in-memory list.
+        Non-tabular sources fetch a list of item references, then call this to
+        ingest only the next ``rows_per_page`` window. A per-``key`` offset is
+        remembered between runs and wraps back to the start once the list has
+        been fully covered (data is not stale, so re-ingesting is desired).
+        Callers must pass the items in a **stable order** across runs (e.g. by
+        id or timestamp) so the cursor stays meaningful.
+        """
+        total = len(items)
+        if total == 0:
+            return []
+        saved = self._sampling_cursor.get(key)
+        offset = saved if isinstance(saved, int) and 0 <= saved < total else 0
+        size = self.sampling_window_size()
+        window = items[offset : offset + size]
+        next_offset = offset + len(window)
+        if next_offset >= total:
+            next_offset = 0  # wrap around on the next run
+        self._record_cursor_key(key, next_offset)
+        return window
     @staticmethod
     def _read_bool_env(name: str) -> bool | None:
         raw = os.environ.get(name)

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/confluence/source.py RENAMED Viewed

@@ -242,11 +242,24 @@ class ConfluenceSource(BaseSource):
                 params["labels"] = ",".join(str(v) for v in spaces_filter.labels)
         return self.client.iter_confluence_results("/wiki/api/v2/spaces", params=params)
+    def _sorted_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        return sorted(
+            refs,
+            key=lambda ref: parse_datetime(
+                str(ref.get("version_created_at") or ref.get("created_at") or "")
+            ),
+            reverse=True,
+        )
     def _sample_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
         sampling = self.config.sampling
         if sampling.strategy == SamplingStrategy.ALL:
             return refs
+        if sampling.strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            return self.automatic_window(self._sorted_page_refs(refs), key="pages")
         limit = int(sampling.rows_per_page or 100)
         if limit >= len(refs):
             return refs
@@ -254,14 +267,7 @@ class ConfluenceSource(BaseSource):
         if sampling.strategy == SamplingStrategy.RANDOM:
             return deterministic_sample(refs, limit)
-        refs_sorted = sorted(
-            refs,
-            key=lambda ref: parse_datetime(
-                str(ref.get("version_created_at") or ref.get("created_at") or "")
-            ),
-            reverse=True,
-        )
-        return refs_sorted[:limit]
+        return self._sorted_page_refs(refs)[:limit]
     def _extract_page_assets(self, ref: dict[str, Any]) -> list[SingleAssetScanResults]:
         page_id = str(ref["page_id"])

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/databricks/source.py RENAMED Viewed

@@ -423,6 +423,11 @@ class DatabricksSource(BaseTabularSource):
             return value.isoformat()
         return str(value)
+    def _automatic_supports_keyset(self) -> bool:
+        # Databricks builds inline (parameter-less) queries; AUTOMATIC uses OFFSET
+        # paging through _fetch_one_page rather than keyset WHERE clauses.
+        return False
     # ── Databricks pagination (inline LIMIT/OFFSET) ──────────────────────
     def _fetch_one_page(

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/email/source.py RENAMED Viewed

@@ -195,6 +195,11 @@ class EmailSource(BaseSource):
         total = 0
         try:
+            if strategy == SamplingStrategy.AUTOMATIC:
+                async for batch in self._extract_automatic(mod, criteria):
+                    yield batch
+                return
             for folder in self.folders:
                 if self._aborted or (limit is not None and total >= limit):
                     break
@@ -236,6 +241,63 @@ class EmailSource(BaseSource):
         finally:
             logger.info("Extracted %s email messages", total)
+    async def _extract_automatic(
+        self, mod: Any, criteria: Any
+    ) -> AsyncGenerator[list[SingleAssetScanResults], None]:
+        """AUTOMATIC sampling: page through each folder's messages by UID.
+        Listing UIDs is cheap (no body fetch); we window the UID list (newest
+        first) so each run ingests the next ``rows_per_page`` slice per folder
+        and wraps around once the folder has been fully covered.
+        """
+        pending: list[SingleAssetScanResults] = []
+        total = 0
+        for folder in self.folders:
+            if self._aborted:
+                break
+            try:
+                self._mailbox.folder.set(folder)
+            except Exception as e:
+                logger.warning("Skipping folder %s: %s", folder, e)
+                continue
+            try:
+                uid_ints = sorted((int(u) for u in self._mailbox.uids(criteria)), reverse=True)
+            except Exception as e:
+                logger.warning("Could not list UIDs for folder %s: %s", folder, e)
+                continue
+            if not uid_ints:
+                continue
+            window = self.automatic_window([str(u) for u in uid_ints], key=f"folder:{folder}")
+            if not window:
+                continue
+            for msg in self._mailbox.fetch(
+                mod.AND(uid=",".join(window)),
+                mark_seen=False,
+                bulk=self.BATCH_SIZE,
+            ):
+                if self._aborted:
+                    break
+                try:
+                    assets = self._message_to_assets(msg, folder)
+                except Exception as e:
+                    logger.error(
+                        "Failed to transform message uid=%s: %s", getattr(msg, "uid", "?"), e
+                    )
+                    continue
+                for asset in assets:
+                    pending.append(asset)
+                    while len(pending) >= self.BATCH_SIZE:
+                        yield pending[: self.BATCH_SIZE]
+                        pending = pending[self.BATCH_SIZE :]
+                total += 1
+        if pending:
+            yield pending
+        logger.info("Extracted %s email messages (AUTOMATIC)", total)
     def _message_to_assets(self, msg: Any, folder: str) -> list[SingleAssetScanResults]:
         message_id = self._message_id(msg, folder)
         email_hash = self.generate_hash_id(message_id)

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/jira/source.py RENAMED Viewed

@@ -193,11 +193,28 @@ class JiraSource(BaseSource):
             return f"{query} ORDER BY updated DESC"
         return query
+    def _sorted_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        return sorted(
+            issues,
+            key=lambda issue: parse_datetime(
+                str(
+                    issue.get("fields", {}).get("updated")
+                    if isinstance(issue.get("fields"), dict)
+                    else ""
+                )
+            ),
+            reverse=True,
+        )
     def _sample_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
         sampling = self.config.sampling
         if sampling.strategy == SamplingStrategy.ALL:
             return issues
+        if sampling.strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            return self.automatic_window(self._sorted_issues(issues), key="issues")
         limit = int(sampling.rows_per_page or 100)
         if limit >= len(issues):
             return issues
@@ -205,18 +222,7 @@ class JiraSource(BaseSource):
         if sampling.strategy == SamplingStrategy.RANDOM:
             return deterministic_sample(issues, limit)
-        sorted_issues = sorted(
-            issues,
-            key=lambda issue: parse_datetime(
-                str(
-                    issue.get("fields", {}).get("updated")
-                    if isinstance(issue.get("fields"), dict)
-                    else ""
-                )
-            ),
-            reverse=True,
-        )
-        return sorted_issues[:limit]
+        return self._sorted_issues(issues)[:limit]
     def _extract_issue_assets(self, issue: dict[str, Any]) -> list[SingleAssetScanResults]:
         fields = issue.get("fields", {})

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/mongodb/source.py RENAMED Viewed

@@ -407,6 +407,14 @@ class MongoDBSource(BaseSource):
         if strategy == SamplingStrategy.ALL:
             return list(collection.find({}).limit(rows_per_page))
+        if strategy == SamplingStrategy.AUTOMATIC:
+            # Page forward through the collection each run; wrap when exhausted.
+            key = f"collection:{collection_ref.database}.{collection_ref.collection}"
+            offset = self.automatic_offset(key)
+            documents = list(collection.find({}).skip(offset).limit(rows_per_page))
+            self.record_automatic_offset(key, prev_offset=offset, fetched=len(documents))
+            return documents
         if strategy == SamplingStrategy.RANDOM:
             return self._sample_random_documents(collection, rows_per_page)

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/neo4j/source.py RENAMED Viewed

@@ -392,6 +392,14 @@ class Neo4jSource(BaseSource):
         strategy = sampling.strategy
         rows = int(sampling.rows_per_page or 100)
+        if strategy == SamplingStrategy.AUTOMATIC:
+            # Page forward through this label's nodes each run; wrap when exhausted.
+            key = f"label:{ref.label}"
+            offset = self.automatic_offset(key)
+            page = self._fetch_nodes_page(ref, skip=offset, limit=rows)
+            self.record_automatic_offset(key, prev_offset=offset, fetched=len(page))
+            return page
         if strategy == SamplingStrategy.RANDOM:
             cypher = (
                 f"MATCH (n:{_escape_label(ref.label)}) "

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/notion/source.py RENAMED Viewed

@@ -338,11 +338,22 @@ class NotionSource(BaseSource):
             "edited": obj.get("last_edited_time") or obj.get("created_time") or "",
         }
+    def _sorted_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        return sorted(
+            refs,
+            key=lambda ref: parse_datetime(str(ref.get("edited") or "")),
+            reverse=True,
+        )
     def _sample_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
         sampling = self.config.sampling
         if sampling.strategy == SamplingStrategy.ALL:
             return refs
+        if sampling.strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            return self.automatic_window(self._sorted_refs(refs), key="refs")
         limit = int(sampling.rows_per_page or 100)
         if limit >= len(refs):
             return refs
@@ -350,12 +361,7 @@ class NotionSource(BaseSource):
         if sampling.strategy == SamplingStrategy.RANDOM:
             return deterministic_sample(refs, limit)
-        refs_sorted = sorted(
-            refs,
-            key=lambda ref: parse_datetime(str(ref.get("edited") or "")),
-            reverse=True,
-        )
-        return refs_sorted[:limit]
+        return self._sorted_refs(refs)[:limit]
     # ------------------------------------------------------------------- pages
     def _extract_page_assets(self, page: dict[str, Any]) -> list[SingleAssetScanResults]:

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/object_storage/base.py RENAMED Viewed

@@ -271,6 +271,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
         materialized = list(refs)
+        if strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            materialized.sort(key=lambda ref: ref.last_modified, reverse=True)
+            return self.automatic_window(materialized, key="objects")
         if strategy == SamplingStrategy.RANDOM:
             if limit >= len(materialized):
                 return materialized

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/powerbi/source.py RENAMED Viewed

@@ -569,11 +569,28 @@ class PowerBISource(BaseSource):
                 return parsed
         return None
+    def _ordered_refs_for_automatic(
+        self, refs: list[PowerBIAssetRef], order_field: str
+    ) -> list[PowerBIAssetRef]:
+        values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
+        scored: list[tuple[bool, datetime, PowerBIAssetRef]] = []
+        for ref, parsed in zip(refs, values, strict=False):
+            effective = parsed or ref.updated_at
+            scored.append((parsed is not None, effective, ref))
+        scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
+        return [item[2] for item in scored]
     def _sample_refs(self, refs: list[PowerBIAssetRef]) -> list[PowerBIAssetRef]:
         sampling = self._sampling()
         if sampling.strategy == SamplingStrategy.ALL:
             return refs
+        if sampling.strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            order_field = sampling.order_by_column or "modifiedDateTime"
+            ordered = self._ordered_refs_for_automatic(refs, order_field)
+            return self.automatic_window(ordered, key="refs")
         if sampling.strategy == SamplingStrategy.RANDOM:
             limit = int(sampling.rows_per_page or 100)
             if limit >= len(refs):

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/recipe_normalizer.py RENAMED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 from copy import deepcopy
 from typing import Any
-_VALID_SAMPLING_STRATEGIES = {"RANDOM", "LATEST", "ALL"}
+_VALID_SAMPLING_STRATEGIES = {"AUTOMATIC", "RANDOM", "LATEST", "ALL"}
 def _as_dict(value: Any) -> dict[str, Any]:
@@ -130,7 +130,7 @@ def normalize_source_recipe(
         _normalize_sampling_strategy(sampling.get("strategy")),
         _normalize_sampling_strategy(optional_sampling.get("strategy")),
         _normalize_sampling_strategy(optional_sampling.get("mode")),
-        "RANDOM",
+        "AUTOMATIC",
     )
     sampling["strategy"] = strategy

{classifyre_cli-0.4.29 → classifyre_cli-0.4.31}/src/sources/servicedesk/source.py RENAMED Viewed

@@ -204,6 +204,11 @@ class ServiceDeskSource(BaseSource):
         if sampling.strategy == SamplingStrategy.ALL:
             return requests
+        if sampling.strategy == SamplingStrategy.AUTOMATIC:
+            # Newest-first stable order; window advances each run and wraps around.
+            sorted_requests = sorted(requests, key=self._request_sort_timestamp, reverse=True)
+            return self.automatic_window(sorted_requests, key="requests")
         limit = int(sampling.rows_per_page or 100)
         if limit >= len(requests):
             return requests

classifyre-cli 0.4.29__tar.gz → 0.4.31__tar.gz

classifyre-cli 0.4.29__tar.gz → 0.4.31__tar.gz