nosible 0.1.8__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterator
4
+ from dataclasses import dataclass, field
4
5
 
5
6
  import duckdb
6
7
  import pandas as pd
@@ -11,6 +12,7 @@ from nosible.classes.result import Result
11
12
  from nosible.utils.json_tools import json_dumps, json_loads
12
13
 
13
14
 
15
+ @dataclass(frozen=True)
14
16
  class ResultSet(Iterator[Result]):
15
17
  """
16
18
  Container class for managing and processing a sequence of Result objects.
@@ -57,33 +59,10 @@ class ResultSet(Iterator[Result]):
57
59
  "url_hash",
58
60
  ]
59
61
 
60
- def __init__(self, results: list[Result] | None = None) -> None:
61
- self.results: list[Result] = results or []
62
- self._index: int = 0
63
-
64
- def _as_dicts(self):
65
- """
66
- Convert the ResultSet to a list of dictionaries.
67
-
68
- Returns
69
- -------
70
- list of dict
71
- List of dictionaries representing each Result.
72
-
73
- """
74
- # dataclass.asdict handles nested structures too
75
- return [r.to_dict() for r in self.results]
76
-
77
- def _as_columns(self):
78
- """
79
- Convert the ResultSet to a dictionary of lists, suitable for DataFrame creation.
80
-
81
- Returns
82
- -------
83
- dict
84
- Dictionary where keys are field names and values are lists of field values.
85
- """
86
- return {f: [getattr(r, f) for r in self.results] for f in self._FIELDS}
62
+ results: list[Result] = field(default_factory=list)
63
+ """ List of Result objects contained in this ResultSet."""
64
+ _index: int = field(default=0, init=False, repr=False, compare=False)
65
+ """ Internal index for iteration over results."""
87
66
 
88
67
  def __len__(self) -> int:
89
68
  """
@@ -116,8 +95,8 @@ class ResultSet(Iterator[Result]):
116
95
  >>> print(search_results) # doctest: +NORMALIZE_WHITESPACE
117
96
  Idx | Similarity | Title
118
97
  ------------------------
119
- 0 | 0.95 | Example Domain
120
- 1 | 0.99 | OpenAI
98
+ 0 | 0.95 | Example Domain
99
+ 1 | 0.99 | OpenAI
121
100
 
122
101
  >>> empty = ResultSet([])
123
102
  >>> print(empty)
@@ -129,29 +108,18 @@ class ResultSet(Iterator[Result]):
129
108
  # Create a formatted string for each result
130
109
  lines = []
131
110
  for idx, result in enumerate(self.results):
132
- similarity = f"{result.similarity:.2f}" if result.similarity is not None else "N/A"
111
+ similarity = f"{result.similarity:.2f}" if result.similarity is not None else " N/A"
133
112
  title = result.title or "No Title"
134
- lines.append(f"{idx:>3} | {similarity:>6} | {title}")
113
+ lines.append(f"{idx:>3} | {similarity:>10} | {title}")
135
114
 
136
- # Add a header
137
- header = "Idx | Similarity | Title"
115
+ # Add a header with matching column widths
116
+ header = f"{'Idx':>3} | {'Similarity':>10} | Title"
138
117
  separator = "-" * len(header)
139
118
  lines.insert(0, header)
140
119
  lines.insert(1, separator)
141
120
  # Join all lines into a single string
142
121
  return "\n".join(lines)
143
122
 
144
- def __repr__(self) -> str:
145
- """
146
- Returns a string representation of the object for interactive sessions.
147
-
148
- Returns
149
- -------
150
- str
151
- The string representation of the object, as returned by `__str__()`.
152
- """
153
- return self.__str__()
154
-
155
123
  def __iter__(self) -> ResultSet:
156
124
  """
157
125
  Reset iteration and return self.
@@ -161,7 +129,7 @@ class ResultSet(Iterator[Result]):
161
129
  ResultSet
162
130
  Iterator over the ResultSet instance.
163
131
  """
164
- self._index = 0
132
+ object.__setattr__(self, "_index", 0)
165
133
  return self
166
134
 
167
135
  def __next__(self) -> Result:
@@ -179,10 +147,29 @@ class ResultSet(Iterator[Result]):
179
147
  """
180
148
  if self._index < len(self.results):
181
149
  item = self.results[self._index]
182
- self._index += 1
150
+ object.__setattr__(self, "_index", self._index + 1)
183
151
  return item
184
152
  raise StopIteration
185
153
 
154
+ def __eq__(self, value):
155
+ """
156
+ Compare set of url_hashes to determine equality.
157
+ Two ResultSet instances are considered equal if they contain the same set of url_hashes.
158
+
159
+ Parameters
160
+ ----------
161
+ value : ResultSet
162
+ The ResultSet instance to compare against.
163
+ Returns
164
+ -------
165
+ bool
166
+ True if both ResultSet instances contain the same set of url_hashes, False otherwise.
167
+ """
168
+ if not isinstance(value, ResultSet):
169
+ return False
170
+ # Compare the sets of url_hashes
171
+ return {r.url_hash for r in self.results} == {r.url_hash for r in value.results}
172
+
186
173
  def __enter__(self) -> ResultSet:
187
174
  """
188
175
  Enters the runtime context related to this object.
@@ -218,28 +205,7 @@ class ResultSet(Iterator[Result]):
218
205
  return self.results[key]
219
206
  raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
220
207
 
221
- def __setitem__(self, key: int, value: Result) -> None:
222
- """
223
- Set a Result at a specific index.
224
-
225
- Parameters
226
- ----------
227
- key : int
228
- Index to set the result at.
229
- value : Result
230
- Result to set at the specified index.
231
-
232
- Raises
233
- ------
234
- IndexError
235
- If index is out of range.
236
- """
237
- if 0 <= key < len(self.results):
238
- self.results[key] = value
239
- else:
240
- raise IndexError(f"Index {key} out of range for ResultSet with length {len(self.results)}.")
241
-
242
- def __add__(self, other: ResultSet) -> ResultSet:
208
+ def __add__(self, other: ResultSet | Result) -> ResultSet:
243
209
  """
244
210
  Concatenate two ResultSet instances.
245
211
 
@@ -265,9 +231,12 @@ class ResultSet(Iterator[Result]):
265
231
  >>> len(combined)
266
232
  2
267
233
  """
268
- if not isinstance(other, ResultSet):
269
- raise TypeError("Can only concatenate ResultSet with another ResultSet.")
270
- return ResultSet(self.results + other.results)
234
+ if isinstance(other, ResultSet):
235
+ return ResultSet(self.results + other.results)
236
+ if isinstance(other, Result):
237
+ # If other is a single Result, create a new ResultSet with it
238
+ return ResultSet(self.results.append(other))
239
+ raise TypeError("Can only concatenate ResultSet with another ResultSet.")
271
240
 
272
241
  def __sub__(self, other: ResultSet) -> ResultSet:
273
242
  """
@@ -315,13 +284,14 @@ class ResultSet(Iterator[Result]):
315
284
 
316
285
  def find_in_search_results(self, query: str, top_k: int = 10) -> ResultSet:
317
286
  """
318
- Perform an in-memory search over a ResultSet collection using Tantivy.
287
+ This allows you to search within the results of a search using BM25 scoring by
288
+ performing an in-memory search over a ResultSet collection using Tantivy.
319
289
 
320
290
  Parameters
321
291
  ----------
322
292
  query : str
323
- The search string to rank within these results.
324
- top_k : int, default=10
293
+ The search string you want to find within these results.
294
+ top_k : int
325
295
  Number of top results to return.
326
296
 
327
297
  Returns
@@ -435,6 +405,39 @@ class ResultSet(Iterator[Result]):
435
405
  ... summary = results.analyze(by="language")
436
406
  ... print(summary)
437
407
  {'en': 100}
408
+ >>> import polars as pl
409
+ >>> from nosible.classes.result_set import Result, ResultSet
410
+
411
+ # -- date grouping (published) --------------------------------------------
412
+ >>> data = [
413
+ ... {"published": "2021-01-15", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.5},
414
+ ... {"published": "2021-02-20", "netloc": "a.com", "author": "", "language": "en", "similarity": 0.8},
415
+ ... {"published": "2021-02-25", "netloc": "b.org", "author": "", "language": "fr", "similarity": 0.2},
416
+ ... ]
417
+ >>> results = ResultSet([Result(**d) for d in data])
418
+ >>> results.analyze(by="published") # doctest: +NORMALIZE_WHITESPACE
419
+ {'2021-01': 1, '2021-02': 2}
420
+
421
+ # -- numeric stats (similarity) ------------------------------------------
422
+ >>> stats = results.analyze(by="similarity")
423
+ >>> set(stats) == {"count", "null_count", "mean", "std", "min", "25%", "50%", "75%", "max"}
424
+ True
425
+ >>> round(stats["mean"], 2)
426
+ 0.5
427
+
428
+ # -- categorical counts (language) --------------------------------------
429
+ >>> results.analyze(by="language")
430
+ {'en': 2, 'fr': 1}
431
+
432
+ # -- author special case ------------------------------------------------
433
+ # empty author strings get mapped to "Author Unknown"
434
+ >>> results.analyze(by="author")
435
+ {'Author Unknown': 3}
436
+
437
+ # -- invalid field -------------------------------------------------------
438
+ >>> results.analyze(by="foobar") # doctest: +IGNORE_EXCEPTION_DETAIL
439
+ Traceback (most recent call last):
440
+ ValueError: Cannot analyze by 'foobar' - not a valid field.
438
441
  """
439
442
  # Convert to Polars DataFrame
440
443
  df: pl.DataFrame = self.to_polars()
@@ -451,7 +454,7 @@ class ResultSet(Iterator[Result]):
451
454
  # Handle author unknown
452
455
  if by == "author":
453
456
  df = df.with_columns(
454
- pl.when(pl.col("author").str.strip() == "")
457
+ pl.when(pl.col("author") == "")
455
458
  .then(pl.lit("Author Unknown"))
456
459
  .otherwise(pl.col("author"))
457
460
  .alias("author")
@@ -464,7 +467,7 @@ class ResultSet(Iterator[Result]):
464
467
  # Extract year-month
465
468
  df = df.with_columns(pl.col(by).dt.strftime("%Y-%m").alias("year_month"))
466
469
  # Count per month
467
- vc = df.groupby("year_month").agg(pl.count().alias("count")).sort("year_month")
470
+ vc = df.group_by("year_month").agg(pl.len().alias("count")).sort("year_month")
468
471
  rows = vc.rows()
469
472
  if not rows:
470
473
  return {}
@@ -477,13 +480,15 @@ class ResultSet(Iterator[Result]):
477
480
  result[month] = cnt
478
481
  return result
479
482
 
483
+ # Numeric stats for similarity
484
+ if by == "similarity":
485
+ desc_df = df["similarity"].describe()
486
+ # print({row[0]: float(row[1]) for row in desc_df.rows()})
487
+ return {row[0]: float(row[1]) for row in desc_df.rows()}
488
+
480
489
  # Non-date: analyze numeric vs. categorical
481
490
  series = df[by]
482
- dtype = series.dtype
483
- # Numeric analysis: descriptive stats
484
- if dtype in (pl.Float64, pl.Float32, pl.Int64, pl.Int32):
485
- desc_df = series.describe()
486
- return {row[0]: float(row[1]) for row in desc_df.rows()}
491
+
487
492
  # Categorical/value counts
488
493
  vc = series.value_counts()
489
494
  _, count_col = vc.columns
@@ -502,11 +507,11 @@ class ResultSet(Iterator[Result]):
502
507
  Parameters
503
508
  ----------
504
509
  file_path : str or None, optional
505
- Path to save the CSV file. If None, defaults to "search_results.csv".
510
+ Path to save the CSV file.
506
511
  delimiter : str, optional
507
- Delimiter to use in the CSV file. Default is ','.
512
+ Delimiter to use in the CSV file.
508
513
  encoding : str, optional
509
- Encoding for the CSV file. Default is "utf-8".
514
+ Encoding for the CSV file.
510
515
 
511
516
  Returns
512
517
  -------
@@ -566,7 +571,7 @@ class ResultSet(Iterator[Result]):
566
571
  >>> "url" in df.columns
567
572
  True
568
573
  """
569
- return pl.DataFrame(self._as_columns())
574
+ return pl.DataFrame(self.to_dicts())
570
575
 
571
576
  def to_pandas(self) -> pd.DataFrame:
572
577
  """
@@ -602,7 +607,7 @@ class ResultSet(Iterator[Result]):
602
607
  except Exception as e:
603
608
  raise RuntimeError(f"Failed to convert search results to Pandas DataFrame: {e}") from e
604
609
 
605
- def to_json(self, file_path: str | None = None) -> str:
610
+ def to_json(self, file_path: str | None = None) -> str | bytes:
606
611
  """
607
612
  Serialize the search results to a JSON string and optionally write to disk.
608
613
 
@@ -637,7 +642,7 @@ class ResultSet(Iterator[Result]):
637
642
  True
638
643
  """
639
644
  try:
640
- json_bytes = json_dumps(self._as_dicts())
645
+ json_bytes = json_dumps(self.to_dicts())
641
646
  if file_path:
642
647
  try:
643
648
  with open(file_path, "w") as f:
@@ -684,9 +689,9 @@ class ResultSet(Iterator[Result]):
684
689
  True
685
690
  """
686
691
  try:
687
- return self._as_dicts()
692
+ return [result.to_dict() for result in self.results]
688
693
  except Exception as e:
689
- raise RuntimeError(f"Failed to convert results to list of dicts: {e}") from e
694
+ raise RuntimeError(f"Failed to convert results to list of dictionaries: {e}") from e
690
695
 
691
696
  def to_dict(self) -> dict:
692
697
  """
@@ -738,7 +743,6 @@ class ResultSet(Iterator[Result]):
738
743
  ----------
739
744
  file_path : str or None, optional
740
745
  Path to save the NDJSON file. If None, returns the NDJSON string.
741
- Default is None.
742
746
 
743
747
  Returns
744
748
  -------
@@ -766,18 +770,22 @@ class ResultSet(Iterator[Result]):
766
770
  >>> path.endswith(".ndjson")
767
771
  True
768
772
  """
769
- try:
770
- lines = "\n".join(json_dumps(d) for d in self._as_dicts())
771
- if file_path:
772
- try:
773
- with open(file_path, "w") as f:
774
- f.write(lines)
775
- return file_path
776
- except Exception as e:
777
- raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
778
- return lines
779
- except Exception as e:
780
- raise RuntimeError(f"Failed to serialize results to NDJSON: {e}") from e
773
+
774
+ ndjson_lines = []
775
+ for result in self.results:
776
+ try:
777
+ ndjson_lines.append(json_dumps(result.to_dict()))
778
+ except Exception as e:
779
+ raise RuntimeError(f"Failed to serialize Result to NDJSON: {e}") from e
780
+
781
+ if file_path:
782
+ try:
783
+ with open(file_path, "w", encoding="utf-8") as f:
784
+ f.write("\n".join(ndjson_lines) + "\n")
785
+ return file_path
786
+ except Exception as e:
787
+ raise RuntimeError(f"Failed to write NDJSON to '{file_path}': {e}") from e
788
+ return "\n".join(ndjson_lines) + "\n"
781
789
 
782
790
  def to_parquet(self, file_path: str | None = None) -> str:
783
791
  """
@@ -789,7 +797,7 @@ class ResultSet(Iterator[Result]):
789
797
  Parameters
790
798
  ----------
791
799
  file_path : str or None, optional
792
- Path to save the Parquet file. If None, defaults to "results.parquet".
800
+ Path to save the Parquet file.
793
801
 
794
802
  Returns
795
803
  -------
@@ -830,7 +838,7 @@ class ResultSet(Iterator[Result]):
830
838
  Parameters
831
839
  ----------
832
840
  file_path : str or None, optional
833
- Path to save the Arrow IPC file. If None, defaults to "results.arrow".
841
+ Path to save the Arrow IPC file.
834
842
 
835
843
  Returns
836
844
  -------
@@ -872,9 +880,9 @@ class ResultSet(Iterator[Result]):
872
880
  Parameters
873
881
  ----------
874
882
  file_path : str or None, optional
875
- Path to save the DuckDB file. If None, defaults to "results.duckdb".
883
+ Path to save the DuckDB file.
876
884
  table_name : str, optional
877
- Name of the table to write the results to. Default is "results".
885
+ Name of the table to write the results to.
878
886
 
879
887
  Returns
880
888
  -------
@@ -1006,11 +1014,6 @@ class ResultSet(Iterator[Result]):
1006
1014
  --------
1007
1015
  >>> import json
1008
1016
  >>> from nosible import ResultSet
1009
- >>> # Suppose 'data.json' contains:
1010
- >>> # [
1011
- >>> # {"url": "https://example.com", "title": "Example Domain"},
1012
- >>> # {"url": "https://openai.com", "title": "OpenAI"}
1013
- >>> # ]
1014
1017
  >>> with open("data.json", "w") as f:
1015
1018
  ... json.dump(
1016
1019
  ... [
@@ -1097,20 +1100,23 @@ class ResultSet(Iterator[Result]):
1097
1100
 
1098
1101
  @classmethod
1099
1102
  def from_pandas(cls, df: pd.DataFrame) -> ResultSet:
1100
- """Create a ResultSet instance from a pandas DataFrame.
1103
+ """
1104
+ Create a ResultSet instance from a pandas DataFrame.
1101
1105
  This class method converts a given pandas DataFrame to a Polars DataFrame
1102
1106
  and then constructs a ResultSet object from it. This is useful for
1103
1107
  integrating with workflows that use pandas for data manipulation.
1108
+
1104
1109
  Parameters
1105
1110
  ----------
1106
1111
  df : pandas.DataFrame
1107
- DataFrame containing the search result fields. Each row should represent
1108
- a single search result, with columns corresponding to the expected fields
1109
- of ResultSet.
1112
+ DataFrame containing the search result fields. Each row should represent a single search result, with
1113
+ columns corresponding to the expected fields of ResultSet.
1114
+
1110
1115
  Returns
1111
1116
  -------
1112
1117
  ResultSet
1113
1118
  An instance of ResultSet containing the data from the input DataFrame.
1119
+
1114
1120
  Examples
1115
1121
  --------
1116
1122
  >>> data = [{"url": "https://example.com", "title": "Example"}]