nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterator
4
+ from dataclasses import dataclass, field
4
5
 
5
6
  import duckdb
6
7
  import pandas as pd
@@ -11,6 +12,7 @@ from nosible.classes.result import Result
11
12
  from nosible.utils.json_tools import json_dumps, json_loads
12
13
 
13
14
 
15
+ @dataclass(frozen=True)
14
16
  class ResultSet(Iterator[Result]):
15
17
  """
16
18
  Container class for managing and processing a sequence of Result objects.
@@ -57,33 +59,10 @@ class ResultSet(Iterator[Result]):
57
59
  "url_hash",
58
60
  ]
59
61
 
60
- def __init__(self, results: list[Result] | None = None) -> None:
61
- self.results: list[Result] = results or []
62
- self._index: int = 0
63
-
64
- def _as_dicts(self):
65
- """
66
- Convert the ResultSet to a list of dictionaries.
67
-
68
- Returns
69
- -------
70
- list of dict
71
- List of dictionaries representing each Result.
72
-
73
- """
74
- # dataclass.asdict handles nested structures too
75
- return [r.to_dict() for r in self.results]
76
-
77
- def _as_columns(self):
78
- """
79
- Convert the ResultSet to a dictionary of lists, suitable for DataFrame creation.
80
-
81
- Returns
82
- -------
83
- dict
84
- Dictionary where keys are field names and values are lists of field values.
85
- """
86
- return {f: [getattr(r, f) for r in self.results] for f in self._FIELDS}
62
+ results: list[Result] = field(default_factory=list)
63
+ """ List of Result objects contained in this ResultSet."""
64
+ _index: int = field(default=0, init=False, repr=False, compare=False)
65
+ """ Internal index for iteration over results."""
87
66
 
88
67
  def __len__(self) -> int:
89
68
  """
@@ -116,8 +95,8 @@ class ResultSet(Iterator[Result]):
116
95
  >>> print(search_results) # doctest: +NORMALIZE_WHITESPACE
117
96
  Idx | Similarity | Title
118
97
  ------------------------
119
- 0 | 0.95 | Example Domain
120
- 1 | 0.99 | OpenAI
98
+ 0 | 0.95 | Example Domain
99
+ 1 | 0.99 | OpenAI
121
100
 
122
101
  >>> empty = ResultSet([])
123
102
  >>> print(empty)
@@ -129,29 +108,18 @@ class ResultSet(Iterator[Result]):
129
108
  # Create a formatted string for each result
130
109
  lines = []
131
110
  for idx, result in enumerate(self.results):
132
- similarity = f"{result.similarity:.2f}" if result.similarity is not None else "N/A"
111
+ similarity = f"{result.similarity:.2f}" if result.similarity is not None else " N/A"
133
112
  title = result.title or "No Title"
134
- lines.append(f"{idx:>3} | {similarity:>6} | {title}")
113
+ lines.append(f"{idx:>3} | {similarity:>10} | {title}")
135
114
 
136
- # Add a header
137
- header = "Idx | Similarity | Title"
115
+ # Add a header with matching column widths
116
+ header = f"{'Idx':>3} | {'Similarity':>10} | Title"
138
117
  separator = "-" * len(header)
139
118
  lines.insert(0, header)
140
119
  lines.insert(1, separator)
141
120
  # Join all lines into a single string
142
121
  return "\n".join(lines)
143
122
 
144
- def __repr__(self) -> str:
145
- """
146
- Returns a string representation of the object for interactive sessions.
147
-
148
- Returns
149
- -------
150
- str
151
- The string representation of the object, as returned by `__str__()`.
152
- """
153
- return self.__str__()
154
-
155
123
  def __iter__(self) -> ResultSet:
156
124
  """
157
125
  Reset iteration and return self.
@@ -161,7 +129,7 @@ class ResultSet(Iterator[Result]):
161
129
  ResultSet
162
130
  Iterator over the ResultSet instance.
163
131
  """
164
- self._index = 0
132
+ object.__setattr__(self, "_index", 0)
165
133
  return self
166
134
 
167
135
  def __next__(self) -> Result:
@@ -179,10 +147,29 @@ class ResultSet(Iterator[Result]):
179
147
  """
180
148
  if self._index < len(self.results):
181
149
  item = self.results[self._index]
182
- self._index += 1
150
+ object.__setattr__(self, "_index", self._index + 1)
183
151
  return item
184
152
  raise StopIteration
185
153
 
154
+ def __eq__(self, value):
155
+ """
156
+ Compare set of url_hashes to determine equality.
157
+ Two ResultSet instances are considered equal if they contain the same set of url_hashes.
158
+
159
+ Parameters
160
+ ----------
161
+ value : ResultSet
162
+ The ResultSet instance to compare against.
163
+ Returns
164
+ -------
165
+ bool
166
+ True if both ResultSet instances contain the same set of url_hashes, False otherwise.
167
+ """
168
+ if not isinstance(value, ResultSet):
169
+ return False
170
+ # Compare the sets of url_hashes
171
+ return {r.url_hash for r in self.results} == {r.url_hash for r in value.results}
172
+
186
173
  def __enter__(self) -> ResultSet:
187
174
  """
188
175
  Enters the runtime context related to this object.
@@ -218,28 +205,7 @@ class ResultSet(Iterator[Result]):
218
205
  return self.results[key]
219
206
  raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
220
207
 
221
- def __setitem__(self, key: int, value: Result) -> None:
222
- """
223
- Set a Result at a specific index.
224
-
225
- Parameters
226
- ----------
227
- key : int
228
- Index to set the result at.
229
- value : Result
230
- Result to set at the specified index.
231
-
232
- Raises
233
- ------
234
- IndexError
235
- If index is out of range.
236
- """
237
- if 0 <= key < len(self.results):
238
- self.results[key] = value
239
- else:
240
- raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
241
-
242
- def __add__(self, other: ResultSet) -> ResultSet:
208
+ def __add__(self, other: ResultSet | Result) -> ResultSet:
243
209
  """
244
210
  Concatenate two ResultSet instances.
245
211
 
@@ -265,9 +231,12 @@ class ResultSet(Iterator[Result]):
265
231
  >>> len(combined)
266
232
  2
267
233
  """
268
- if not isinstance(other, ResultSet):
269
- raise TypeError("Can only concatenate ResultSet with another ResultSet.")
270
- return ResultSet(self.results + other.results)
234
+ if isinstance(other, ResultSet):
235
+ return ResultSet(self.results + other.results)
236
+ if isinstance(other, Result):
237
+ # If other is a single Result, create a new ResultSet with it
238
+ return ResultSet(self.results.append(other))
239
+ raise TypeError("Can only concatenate ResultSet with another ResultSet.")
271
240
 
272
241
  def __sub__(self, other: ResultSet) -> ResultSet:
273
242
  """
@@ -315,13 +284,14 @@ class ResultSet(Iterator[Result]):
315
284
 
316
285
  def find_in_search_results(self, query: str, top_k: int = 10) -> ResultSet:
317
286
  """
318
- Perform an in-memory search over a ResultSet collection using Tantivy.
287
+ This allows you to search within the results of a search using BM25 scoring by
288
+ performing an in-memory search over a ResultSet collection using Tantivy.
319
289
 
320
290
  Parameters
321
291
  ----------
322
292
  query : str
323
- The search string to rank within these results.
324
- top_k : int, default=10
293
+ The search string you want to find within these results.
294
+ top_k : int
325
295
  Number of top results to return.
326
296
 
327
297
  Returns
@@ -435,6 +405,39 @@ class ResultSet(Iterator[Result]):
435
405
  ... summary = results.analyze(by="language")
436
406
  ... print(summary)
437
407
  {'en': 100}
408
+ >>> import polars as pl
409
+ >>> from nosible.classes.result_set import Result, ResultSet
410
+
411
+ # -- date grouping (published) --------------------------------------------
412
+ >>> data = [
413
+ ... {"published": "2021-01-15", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.5},
414
+ ... {"published": "2021-02-20", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.8},
415
+ ... {"published": "2021-02-25", "netloc": "b.org", "author": "", "language": "fr", "similarity": 0.2},
416
+ ... ]
417
+ >>> results = ResultSet([Result(**d) for d in data])
418
+ >>> results.analyze(by="published") # doctest: +NORMALIZE_WHITESPACE
419
+ {'2021-01': 1, '2021-02': 2}
420
+
421
+ # -- numeric stats (similarity) ------------------------------------------
422
+ >>> stats = results.analyze(by="similarity")
423
+ >>> set(stats) == {"count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"}
424
+ True
425
+ >>> round(stats["mean"], 2)
426
+ 0.5
427
+
428
+ # -- categorical counts (language) --------------------------------------
429
+ >>> results.analyze(by="language")
430
+ {'en': 2, 'fr': 1}
431
+
432
+ # -- author special case ------------------------------------------------
433
+ # empty author strings get mapped to "Author Unknown"
434
+ >>> results.analyze(by="author")
435
+ {'Author Unknown': 3}
436
+
437
+ # -- invalid field -------------------------------------------------------
438
+ >>> results.analyze(by="foobar") # doctest: +IGNORE_EXCEPTION_DETAIL
439
+ Traceback (most recent call last):
440
+ ValueError: Cannot analyze by 'foobar' - not a valid field.
438
441
  """
439
442
  # Convert to Polars DataFrame
440
443
  df: pl.DataFrame = self.to_polars()
@@ -451,7 +454,7 @@ class ResultSet(Iterator[Result]):
451
454
  # Handle author unknown
452
455
  if by == "author":
453
456
  df = df.with_columns(
454
- pl.when(pl.col("author").str.strip() == "")
457
+ pl.when(pl.col("author") == "")
455
458
  .then(pl.lit("Author Unknown"))
456
459
  .otherwise(pl.col("author"))
457
460
  .alias("author")
@@ -464,7 +467,7 @@ class ResultSet(Iterator[Result]):
464
467
  # Extract year-month
465
468
  df = df.with_columns(pl.col(by).dt.strftime("%Y-%m").alias("year_month"))
466
469
  # Count per month
467
- vc = df.groupby("year_month").agg(pl.count().alias("count")).sort("year_month")
470
+ vc = df.group_by("year_month").agg(pl.len().alias("count")).sort("year_month")
468
471
  rows = vc.rows()
469
472
  if not rows:
470
473
  return {}
@@ -477,13 +480,15 @@ class ResultSet(Iterator[Result]):
477
480
  result[month] = cnt
478
481
  return result
479
482
 
483
+ # Numeric stats for similarity
484
+ if by == "similarity":
485
+ desc_df = df["similarity"].describe()
486
+ # print({row[0]: float(row[1]) for row in desc_df.rows()})
487
+ return {row[0]: float(row[1]) for row in desc_df.rows()}
488
+
480
489
  # Non-date: analyze numeric vs. categorical
481
490
  series = df[by]
482
- dtype = series.dtype
483
- # Numeric analysis: descriptive stats
484
- if dtype in (pl.Float64, pl.Float32, pl.Int64, pl.Int32):
485
- desc_df = series.describe()
486
- return {row[0]: float(row[1]) for row in desc_df.rows()}
491
+
487
492
  # Categorical/value counts
488
493
  vc = series.value_counts()
489
494
  _, count_col = vc.columns
@@ -502,11 +507,11 @@ class ResultSet(Iterator[Result]):
502
507
  Parameters
503
508
  ----------
504
509
  file_path : str or None, optional
505
- Path to save the CSV file. If None, defaults to "search_results.csv".
510
+ Path to save the CSV file.
506
511
  delimiter : str, optional
507
- Delimiter to use in the CSV file. Default is ','.
512
+ Delimiter to use in the CSV file.
508
513
  encoding : str, optional
509
- Encoding for the CSV file. Default is "utf-8".
514
+ Encoding for the CSV file.
510
515
 
511
516
  Returns
512
517
  -------
@@ -566,7 +571,7 @@ class ResultSet(Iterator[Result]):
566
571
  >>> "url" in df.columns
567
572
  True
568
573
  """
569
- return pl.DataFrame(self._as_columns())
574
+ return pl.DataFrame(self.to_dicts())
570
575
 
571
576
  def to_pandas(self) -> pd.DataFrame:
572
577
  """
@@ -602,7 +607,7 @@ class ResultSet(Iterator[Result]):
602
607
  except Exception as e:
603
608
  raise RuntimeError(f"Failed to convert search results to Pandas DataFrame: {e}") from e
604
609
 
605
- def to_json(self, file_path: str | None = None) -> str:
610
+ def to_json(self, file_path: str | None = None) -> str | bytes:
606
611
  """
607
612
  Serialize the search results to a JSON string and optionally write to disk.
608
613
 
@@ -637,7 +642,7 @@ class ResultSet(Iterator[Result]):
637
642
  True
638
643
  """
639
644
  try:
640
- json_bytes = json_dumps(self._as_dicts())
645
+ json_bytes = json_dumps(self.to_dicts())
641
646
  if file_path:
642
647
  try:
643
648
  with open(file_path, "w") as f:
@@ -684,9 +689,9 @@ class ResultSet(Iterator[Result]):
684
689
  True
685
690
  """
686
691
  try:
687
- return self._as_dicts()
692
+ return [result.to_dict() for result in self.results]
688
693
  except Exception as e:
689
- raise RuntimeError(f"Failed to convert results to list of dicts: {e}") from e
694
+ raise RuntimeError(f"Failed to convert results to list of dictionaries: {e}") from e
690
695
 
691
696
  def to_dict(self) -> dict:
692
697
  """
@@ -738,7 +743,6 @@ class ResultSet(Iterator[Result]):
738
743
  ----------
739
744
  file_path : str or None, optional
740
745
  Path to save the NDJSON file. If None, returns the NDJSON string.
741
- Default is None.
742
746
 
743
747
  Returns
744
748
  -------
@@ -766,18 +770,22 @@ class ResultSet(Iterator[Result]):
766
770
  >>> path.endswith(".ndjson")
767
771
  True
768
772
  """
769
- try:
770
- lines = "\n".join(json_dumps(d) for d in self._as_dicts())
771
- if file_path:
772
- try:
773
- with open(file_path, "w") as f:
774
- f.write(lines)
775
- return file_path
776
- except Exception as e:
777
- raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
778
- return lines
779
- except Exception as e:
780
- raise RuntimeError(f"Failed to serialize results to NDJSON: {e}") from e
773
+
774
+ ndjson_lines = []
775
+ for result in self.results:
776
+ try:
777
+ ndjson_lines.append(json_dumps(result.to_dict()))
778
+ except Exception as e:
779
+ raise RuntimeError(f"Failed to serialize Result to NDJSON: {e}") from e
780
+
781
+ if file_path:
782
+ try:
783
+ with open(file_path, "w", encoding="utf-8") as f:
784
+ f.write("\n".join(ndjson_lines) + "\n")
785
+ return file_path
786
+ except Exception as e:
787
+ raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
788
+ return "\n".join(ndjson_lines) + "\n"
781
789
 
782
790
  def to_parquet(self, file_path: str | None = None) -> str:
783
791
  """
@@ -789,7 +797,7 @@ class ResultSet(Iterator[Result]):
789
797
  Parameters
790
798
  ----------
791
799
  file_path : str or None, optional
792
- Path to save the Parquet file. If None, defaults to "results.parquet".
800
+ Path to save the Parquet file.
793
801
 
794
802
  Returns
795
803
  -------
@@ -830,7 +838,7 @@ class ResultSet(Iterator[Result]):
830
838
  Parameters
831
839
  ----------
832
840
  file_path : str or None, optional
833
- Path to save the Arrow IPC file. If None, defaults to "results.arrow".
841
+ Path to save the Arrow IPC file.
834
842
 
835
843
  Returns
836
844
  -------
@@ -872,9 +880,9 @@ class ResultSet(Iterator[Result]):
872
880
  Parameters
873
881
  ----------
874
882
  file_path : str or None, optional
875
- Path to save the DuckDB file. If None, defaults to "results.duckdb".
883
+ Path to save the DuckDB file.
876
884
  table_name : str, optional
877
- Name of the table to write the results to. Default is "results".
885
+ Name of the table to write the results to.
878
886
 
879
887
  Returns
880
888
  -------
@@ -1006,11 +1014,6 @@ class ResultSet(Iterator[Result]):
1006
1014
  --------
1007
1015
  >>> import json
1008
1016
  >>> from nosible import ResultSet
1009
- >>> # Suppose 'data.json' contains:
1010
- >>> # [
1011
- >>> # {"url": "https://example.com", "title": "Example Domain"},
1012
- >>> # {"url": "https://openai.com", "title": "OpenAI"}
1013
- >>> # ]
1014
1017
  >>> with open("data.json", "w") as f:
1015
1018
  ... json.dump(
1016
1019
  ... [
@@ -1097,20 +1100,23 @@ class ResultSet(Iterator[Result]):
1097
1100
 
1098
1101
  @classmethod
1099
1102
  def from_pandas(cls, df: pd.DataFrame) -> ResultSet:
1100
- """Create a ResultSet instance from a pandas DataFrame.
1103
+ """
1104
+ Create a ResultSet instance from a pandas DataFrame.
1101
1105
  This class method converts a given pandas DataFrame to a Polars DataFrame
1102
1106
  and then constructs a ResultSet object from it. This is useful for
1103
1107
  integrating with workflows that use pandas for data manipulation.
1108
+
1104
1109
  Parameters
1105
1110
  ----------
1106
1111
  df : pandas.DataFrame
1107
- DataFrame containing the search result fields. Each row should represent
1108
- a single search result, with columns corresponding to the expected fields
1109
- of ResultSet.
1112
+ DataFrame containing the search result fields. Each row should represent a single search result, with
1113
+ columns corresponding to the expected fields of ResultSet.
1114
+
1110
1115
  Returns
1111
1116
  -------
1112
1117
  ResultSet
1113
1118
  An instance of ResultSet containing the data from the input DataFrame.
1119
+
1114
1120
  Examples
1115
1121
  --------
1116
1122
  >>> data = [{"url": "https://example.com", "title": "Example"}]