everyrow-0.1.0-py3-none-any.whl → everyrow-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,174 +0,0 @@
- from __future__ import annotations
-
- from collections.abc import Mapping
- from typing import Any, TypeVar, cast
-
- from attrs import define as _attrs_define
- from attrs import field as _attrs_field
-
- from ..models.dedupe_mode import DedupeMode
- from ..models.embedding_models import EmbeddingModels
- from ..models.llm_enum import LLMEnum
- from ..types import UNSET, Unset
-
- T = TypeVar("T", bound="DedupeQueryParams")
-
-
- @_attrs_define
- class DedupeQueryParams:
-     """Service-specific parameters for the deduplication service.
-
-     Attributes:
-         equivalence_relation (str): Description of what makes items equivalent
-         llm (LLMEnum | Unset):
-         chunk_size (int | Unset): Maximum number of items to process in a single LLM call Default: 25.
-         mode (DedupeMode | Unset):
-         preview (bool | Unset): When true, process only the first few items Default: False.
-         embedding_model (EmbeddingModels | Unset):
-         validate_groups (bool | Unset): Validate equivalence classes and split incorrectly merged groups before
-             selecting representatives Default: False.
-         use_clustering (bool | Unset): When true, cluster items by embedding similarity and only compare neighboring
-             clusters. When false, use sequential chunking and compare all chunks (O(n²)) Default: True.
-         early_stop_threshold (int | None | Unset): Stop cross-chunk comparisons for a row after this many consecutive
-             comparisons with no matches. None disables early stopping.
-     """
-
-     equivalence_relation: str
-     llm: LLMEnum | Unset = UNSET
-     chunk_size: int | Unset = 25
-     mode: DedupeMode | Unset = UNSET
-     preview: bool | Unset = False
-     embedding_model: EmbeddingModels | Unset = UNSET
-     validate_groups: bool | Unset = False
-     use_clustering: bool | Unset = True
-     early_stop_threshold: int | None | Unset = UNSET
-     additional_properties: dict[str, Any] = _attrs_field(init=False, factory=dict)
-
-     def to_dict(self) -> dict[str, Any]:
-         equivalence_relation = self.equivalence_relation
-
-         llm: str | Unset = UNSET
-         if not isinstance(self.llm, Unset):
-             llm = self.llm.value
-
-         chunk_size = self.chunk_size
-
-         mode: str | Unset = UNSET
-         if not isinstance(self.mode, Unset):
-             mode = self.mode.value
-
-         preview = self.preview
-
-         embedding_model: str | Unset = UNSET
-         if not isinstance(self.embedding_model, Unset):
-             embedding_model = self.embedding_model.value
-
-         validate_groups = self.validate_groups
-
-         use_clustering = self.use_clustering
-
-         early_stop_threshold: int | None | Unset
-         if isinstance(self.early_stop_threshold, Unset):
-             early_stop_threshold = UNSET
-         else:
-             early_stop_threshold = self.early_stop_threshold
-
-         field_dict: dict[str, Any] = {}
-         field_dict.update(self.additional_properties)
-         field_dict.update(
-             {
-                 "equivalence_relation": equivalence_relation,
-             }
-         )
-         if llm is not UNSET:
-             field_dict["llm"] = llm
-         if chunk_size is not UNSET:
-             field_dict["chunk_size"] = chunk_size
-         if mode is not UNSET:
-             field_dict["mode"] = mode
-         if preview is not UNSET:
-             field_dict["preview"] = preview
-         if embedding_model is not UNSET:
-             field_dict["embedding_model"] = embedding_model
-         if validate_groups is not UNSET:
-             field_dict["validate_groups"] = validate_groups
-         if use_clustering is not UNSET:
-             field_dict["use_clustering"] = use_clustering
-         if early_stop_threshold is not UNSET:
-             field_dict["early_stop_threshold"] = early_stop_threshold
-
-         return field_dict
-
-     @classmethod
-     def from_dict(cls: type[T], src_dict: Mapping[str, Any]) -> T:
-         d = dict(src_dict)
-         equivalence_relation = d.pop("equivalence_relation")
-
-         _llm = d.pop("llm", UNSET)
-         llm: LLMEnum | Unset
-         if isinstance(_llm, Unset):
-             llm = UNSET
-         else:
-             llm = LLMEnum(_llm)
-
-         chunk_size = d.pop("chunk_size", UNSET)
-
-         _mode = d.pop("mode", UNSET)
-         mode: DedupeMode | Unset
-         if isinstance(_mode, Unset):
-             mode = UNSET
-         else:
-             mode = DedupeMode(_mode)
-
-         preview = d.pop("preview", UNSET)
-
-         _embedding_model = d.pop("embedding_model", UNSET)
-         embedding_model: EmbeddingModels | Unset
-         if isinstance(_embedding_model, Unset):
-             embedding_model = UNSET
-         else:
-             embedding_model = EmbeddingModels(_embedding_model)
-
-         validate_groups = d.pop("validate_groups", UNSET)
-
-         use_clustering = d.pop("use_clustering", UNSET)
-
-         def _parse_early_stop_threshold(data: object) -> int | None | Unset:
-             if data is None:
-                 return data
-             if isinstance(data, Unset):
-                 return data
-             return cast(int | None | Unset, data)
-
-         early_stop_threshold = _parse_early_stop_threshold(d.pop("early_stop_threshold", UNSET))
-
-         dedupe_query_params = cls(
-             equivalence_relation=equivalence_relation,
-             llm=llm,
-             chunk_size=chunk_size,
-             mode=mode,
-             preview=preview,
-             embedding_model=embedding_model,
-             validate_groups=validate_groups,
-             use_clustering=use_clustering,
-             early_stop_threshold=early_stop_threshold,
-         )
-
-         dedupe_query_params.additional_properties = d
-         return dedupe_query_params
-
-     @property
-     def additional_keys(self) -> list[str]:
-         return list(self.additional_properties.keys())
-
-     def __getitem__(self, key: str) -> Any:
-         return self.additional_properties[key]
-
-     def __setitem__(self, key: str, value: Any) -> None:
-         self.additional_properties[key] = value
-
-     def __delitem__(self, key: str) -> None:
-         del self.additional_properties[key]
-
-     def __contains__(self, key: str) -> bool:
-         return key in self.additional_properties
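
For orientation, here is a minimal sketch of how this generated model round-trips a payload, following the `to_dict`/`from_dict` code above. The `everyrow.generated.models...` and `everyrow.generated.types` import paths are inferred from the relative imports in the diff, and the `trace_id` key is a made-up extra for illustration:

```python
# Assumed import paths, based on the relative imports in the generated module.
from everyrow.generated.models.dedupe_query_params import DedupeQueryParams
from everyrow.generated.types import UNSET

raw = {
    "equivalence_relation": "Two rows describe the same research paper",
    "chunk_size": 50,
    "early_stop_threshold": None,  # JSON null: early stopping disabled
    "trace_id": "abc123",          # unknown key (hypothetical), kept as an extra
}
params = DedupeQueryParams.from_dict(raw)
assert params.chunk_size == 50
assert params.llm is UNSET             # absent keys stay UNSET, not None
assert params["trace_id"] == "abc123"  # extras live in additional_properties

payload = params.to_dict()
assert "llm" not in payload                     # UNSET fields are omitted
assert payload["early_stop_threshold"] is None  # an explicit null survives
assert payload["trace_id"] == "abc123"          # extras are serialized back
```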
@@ -1,9 +0,0 @@
- from enum import Enum
-
-
- class EmbeddingModels(str, Enum):
-     TEXT_EMBEDDING_3_LARGE = "text-embedding-3-large"
-     TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"
-
-     def __str__(self) -> str:
-         return str(self.value)
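
Because `EmbeddingModels` subclasses `str`, its members compare equal to their wire values, which is what makes the `EmbeddingModels(_embedding_model)` parse in `from_dict` above lossless. A small sketch, using the same assumed module path as before:

```python
# Assumed import path; behavior follows directly from str + Enum semantics.
from everyrow.generated.models.embedding_models import EmbeddingModels

model = EmbeddingModels("text-embedding-3-small")  # lookup by payload value
assert model == "text-embedding-3-small"           # str subclass equals its value
assert str(EmbeddingModels.TEXT_EMBEDDING_3_LARGE) == "text-embedding-3-large"
```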
@@ -1,238 +0,0 @@
- Metadata-Version: 2.4
- Name: everyrow
- Version: 0.1.0
- Summary: An SDK for everyrow.io: agent ops at spreadsheet scale
- License-File: LICENSE.txt
- Requires-Python: >=3.12
- Requires-Dist: attrs>=25.4.0
- Requires-Dist: pandas>=2.3.3
- Requires-Dist: pydantic>=2.12.5
- Requires-Dist: python-dotenv>=1.2.1
- Description-Content-Type: text/markdown
-
- # everyrow SDK
-
- The everyrow SDK provides intelligent data processing utilities powered by AI agents. Transform, dedupe, merge, rank, and screen your dataframes using natural language instructions. Whether you're deduplicating research papers, merging complex datasets, ranking organizations, or screening vendors, the SDK handles the heavy lifting by combining AI research capabilities with structured data operations.
-
- ## Installation
-
- ```bash
- uv pip install -e .
- ```
-
- Or install dependencies:
-
- ```bash
- uv sync
- ```
-
- ## Requirements
-
- - Python >= 3.12
-
- ## Configuration
-
- Get an API key from https://everyrow.io and set it to get started:
-
- ```bash
- # Set in your environment or .env file
- EVERYROW_API_KEY=your_api_key_here
- ```
-
- ## Usage
-
- ### Quick Start
-
- ```python
- from everyrow import create_session
- from everyrow.ops import dedupe
- from pandas import DataFrame
-
- async with create_session() as session:
-     data = DataFrame([...])
-     result = await dedupe(
-         session=session,
-         input=data,
-         equivalence_relation="Two items are duplicates if...",
-     )
-     print(result.data)
- ```
-
- ### Core Utilities
-
- #### Rank: `rank`
-
- Extract and rank rows based on AI-generated scores:
-
- ```python
- from everyrow.ops import rank
-
- result = await rank(
-     session=session,
-     task="Score this organization by their contribution to AI research",
-     input=dataframe,
-     field_name="contribution_score",
-     ascending_order=False,
- )
- ```
-
- #### Dedupe: `dedupe`
-
- Intelligently deduplicate your data using AI-powered equivalence detection:
-
- ```python
- from everyrow.ops import dedupe
-
- result = await dedupe(
-     session=session,
-     input=dataframe,
-     equivalence_relation="Two entries are duplicates if they represent the same research work",
- )
- ```
-
- #### Merge: `merge`
-
- Merge two tables using AI to match related rows:
-
- ```python
- from everyrow.ops import merge
-
- result = await merge(
-     session=session,
-     task="Match clinical trial sponsors with parent companies",
-     left_table=trial_data,
-     right_table=company_data,
-     merge_on_left="sponsor",
-     merge_on_right="company",
- )
- ```
-
- #### Screen: `screen`
-
- Evaluate and filter rows based on criteria that require research:
-
- ```python
- from everyrow.ops import screen
- from pydantic import BaseModel
-
- class Assessment(BaseModel):
-     risk_level: str
-     recommendation: str
-
- result = await screen(
-     session=session,
-     task="Evaluate vendor security and financial stability",
-     input=vendors,
-     response_model=Assessment,
- )
- ```
-
- ### Viewing Sessions
-
- Every session has a web interface URL:
-
- ```python
- async with create_session(name="My Session") as session:
-     print(f"View session at: {session.get_url()}")
-     # ... use session for operations
- ```
-
- ### Agent Tasks
-
- For single-input tasks, use `single_agent`:
-
- ```python
- from everyrow.ops import single_agent
- from pydantic import BaseModel
-
- class Input(BaseModel):
-     country: str
-
- result = await single_agent(
-     session=session,
-     task="What is the capital of the given country?",
-     input=Input(country="India"),
- )
- ```
-
- For batch processing, use `agent_map`:
-
- ```python
- from everyrow.ops import agent_map
-
- result = await agent_map(
-     session=session,
-     task="What is the capital of the given country?",
-     input=DataFrame([{"country": "India"}, {"country": "USA"}]),
- )
- ```
-
- ### Async Operations
-
- All utilities have async variants for background processing:
-
- ```python
- from everyrow.ops import rank_async
-
- task = await rank_async(
-     session=session,
-     task="Score this organization",
-     input=dataframe,
-     field_name="score",
- )
-
- # Continue with other work...
-
- result = await task.await_result(session.client)
- ```
-
- ## Case Studies
-
- The `case_studies/` directory contains example workflows demonstrating real-world usage of the SDK. To run case studies, install the optional dependencies:
-
- ```bash
- uv sync --group case-studies
- ```
-
- Then you can run the case study scripts or open the Jupyter notebooks in your preferred environment.
-
- ## Development
-
- ### Setup
-
- ```bash
- uv sync
- lefthook install
- ```
-
- ### Running Tests
-
- ```bash
- uv run pytest
- ```
-
- ### Linting & Formatting
-
- ```bash
- uv run ruff check .
- uv run ruff check --fix .
- uv run ruff format .
- ```
-
- ### Type Checking
-
- ```bash
- uv run basedpyright
- ```
-
- ### Generating OpenAPI Client
-
- ```bash
- ./generate_openapi.sh
- ```
-
- Note: The `everyrow/generated/` directory is excluded from linting as it contains auto-generated code.
-
- ## License
-
- This project is licensed under the MIT License - see the LICENSE.txt file for details.