sturdy-stats-sdk 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.1
2
+ Name: sturdy-stats-sdk
3
+ Version: 1.0.0
4
+ Summary: SDK for the Sturdy Statistics API
5
+ Home-page: https://sturdystatistics.com/api/documentation
6
+ Author: Kian Ghodoussi
7
+ Author-email: ghodoussikian@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: more-itertools
12
+ Requires-Dist: srsly
13
+
14
+ # Sturdy Stats SDK
@@ -0,0 +1 @@
1
+ # Sturdy Stats SDK
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
import setuptools

# Read the long description explicitly as UTF-8 so that the build does not
# depend on the platform's default text encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="sturdy-stats-sdk",
    version="1.0.0",
    author="Kian Ghodoussi",
    author_email="ghodoussikian@gmail.com",
    description="SDK for the Sturdy Statistics API",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://sturdystatistics.com/api/documentation",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
    ],
    python_requires='>=3.9',
    # `requests` is imported by sturdystats/index.py and sturdystats/job.py,
    # so it must be declared here or a clean install fails at import time.
    install_requires=['more-itertools', 'requests', 'srsly'],
)
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.1
2
+ Name: sturdy-stats-sdk
3
+ Version: 1.0.0
4
+ Summary: SDK for the Sturdy Statistics API
5
+ Home-page: https://sturdystatistics.com/api/documentation
6
+ Author: Kian Ghodoussi
7
+ Author-email: ghodoussikian@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: more-itertools
12
+ Requires-Dist: srsly
13
+
14
+ # Sturdy Stats SDK
@@ -0,0 +1,10 @@
1
+ README.md
2
+ setup.py
3
+ sturdy_stats_sdk.egg-info/PKG-INFO
4
+ sturdy_stats_sdk.egg-info/SOURCES.txt
5
+ sturdy_stats_sdk.egg-info/dependency_links.txt
6
+ sturdy_stats_sdk.egg-info/requires.txt
7
+ sturdy_stats_sdk.egg-info/top_level.txt
8
+ sturdystats/__init__.py
9
+ sturdystats/index.py
10
+ sturdystats/job.py
@@ -0,0 +1,2 @@
1
+ more-itertools
2
+ srsly
@@ -0,0 +1,2 @@
1
+ from sturdystats.job import Job
2
+ from sturdystats.index import Index
@@ -0,0 +1,403 @@
1
+ import requests
2
+ from sturdystats.job import Job
3
+
4
+ import srsly # to decode output
5
+ from more_itertools import chunked # to batch data for API calls
6
+
7
+
8
+
9
+ # for type checking
10
+ from typing import Optional, Iterable, Dict
11
+ from requests.models import Response
12
+
13
+
14
class Index:
    """Client for a single index of the Sturdy Statistics text API.

    Constructing an instance looks the index up by name among the indices
    listed for the API key and creates it when no index with that name is
    found. All other methods operate on the index identified by ``self.id``.
    """

    ## TODO support id based loading as well if already exists
    def __init__(self, API_key: str, name: str):
        """Bind to the index called ``name``, creating it if necessary.

        Args:
            API_key: API key sent with every request.
            name: Name of the index to look up or create.
        """
        self.API_key = API_key
        # Production endpoint; every request URL is built relative to it.
        self.base_url = "https://sturdystatistics.com/api/text/v1/index"

        self.name = name
        self.id = None

        status = self._get_status(index_name=self.name)
        if status is None:
            self.id = self._create(self.name)
            print(f"""Created new index with id="{self.id}".""")
        else:
            self.id = status["index_id"]
            print(f"""Found an existing index with id="{self.id}".""")

    def _check_status(self, info: "Response") -> None:
        """Raise if ``info`` is not an HTTP 200 response.

        The status code and response body are printed before raising.

        Raises:
            AssertionError: If the status code is not 200. The exception
                type is kept for backward compatibility, but it is raised
                explicitly so the check is not stripped under ``python -O``.
        """
        if 200 == info.status_code:
            return
        print(f"""error code {info.status_code}""")
        # errors="replace" keeps an undecodable body from hiding the HTTP error.
        print(info.content.decode("utf-8", errors="replace"))
        raise AssertionError(
            f"request failed with status code {info.status_code}"
        )

    def _post(self, url: str, params: Dict) -> "Response":
        """POST ``params`` plus the API key as JSON to ``base_url + url``."""
        payload = {"api_key": self.API_key, **params}
        res = requests.post(self.base_url + url, json=payload)
        self._check_status(res)
        return res

    def _get(self, url: str, params: Dict) -> "Response":
        """GET ``base_url + url`` with ``params`` plus the API key as query args."""
        params = {"api_key": self.API_key, **params}
        res = requests.get(self.base_url + url, params=params)
        self._check_status(res)
        return res

    def _create(self, index_name: str):
        """Creates a new index. An index is the core data structure for
        storing data. Once the index is trained, an index may also be
        used to search, query, and analyze data. If an index with the
        provided name already exists, no index will be created and the
        metadata of that index will be returned.

        https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/createIndex

        Returns:
            The ``index_id`` field of the JSON response.
        """

        # Create a new index associated with this API key. Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #      "name": "INDEX_NAME"
        #    }'

        info = self._post("", dict(name=index_name))
        return info.json()["index_id"]

    def _get_status_by_name(self, index_name: str):
        """Return the metadata of the index named ``index_name``, or ``None``.

        Raises:
            AssertionError: If more than one listed index has that name.
        """

        # List all indices associated with this API key. Equivalent to:
        #
        # curl -X GET 'https://sturdystatistics.com/api/text/v1/index?api_key=API_KEY'
        #
        # https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/listIndicies

        info = self._get("", dict())

        # find matches by name
        matches = [i for i in info.json() if i["name"] == index_name]
        if not matches:
            return None
        if 1 != len(matches):
            raise AssertionError(
                f"expected exactly one index named {index_name!r}, "
                f"found {len(matches)}"
            )
        return matches[0]

    def _get_status_by_id(self, index_id: str):
        """Return the decoded JSON metadata of the index with ``index_id``."""

        # curl -X GET 'https://sturdystatistics.com/api/text/v1/index/{index_id}?api_key=API_KEY'

        info = self._get(f"/{index_id}", dict())
        return info.json()

    def _get_status(self,
                    index_name: Optional[str] = None,
                    index_id: Optional[str] = None):
        """Look up an index by name or ID and return all metadata
        associated with the index.

        https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/getSingleIndexInfo

        Exactly one of ``index_name`` and ``index_id`` must be given.

        Raises:
            ValueError: If neither or both arguments are provided.
        """

        if (index_name is None) and (index_id is None):
            raise ValueError("Must provide either an index_name or an index_id.")
        if (index_name is not None) and (index_id is not None):
            raise ValueError("Cannot provide both an index_name and an index_id.")
        if index_id is not None:
            # look up by index_id:
            return self._get_status_by_id(index_id)
        # look up by name:
        return self._get_status_by_name(index_name)

    def get_status(self) -> dict:
        """Return this index's metadata, by id when known, else by name."""
        if self.id is not None:
            return self._get_status(index_id=self.id)
        return self._get_status(index_name=self.name)

    def commit(self, wait: bool = True):
        """Commit changes from the staging index to the permanent index.

        Args:
            wait: When true, block until the job finishes and return the
                value of ``Job.wait()``; otherwise return the ``Job`` itself.
        """
        print(f"""committing changes to index "{self.id}"...""", end="")
        # Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index/{index_id}/doc/commit \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #    }'
        info = self._post(f"/{self.id}/doc/commit", dict())
        job_id = info.json()["job_id"]
        job = Job(self.API_key, job_id, 10)
        if not wait:
            return job
        return job.wait()

    def unstage(self, wait: bool = True):
        """Submit an unstage job for this index.

        Args:
            wait: When true, block until the job finishes and return the
                value of ``Job.wait()``; otherwise return the ``Job`` itself.
        """
        print(f"""unstaging changes to index "{self.id}"...""", end="")
        # Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index/{index_id}/doc/unstage \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #    }'
        info = self._post(f"/{self.id}/doc/unstage", dict())
        job_id = info.json()["job_id"]
        job = Job(self.API_key, job_id, 5)
        if not wait:
            return job
        return job.wait()

    def _upload_batch(self, records: Iterable[Dict], save = "true"):
        """Upload one batch of at most 250 documents and wait for the job.

        Args:
            records: Documents to send under the ``docs`` field.
            save: Value sent as the ``save`` field of the request.

        Returns:
            The value of ``Job.wait()`` for the upload job.

        Raises:
            RuntimeError: If the batch holds more than 250 documents.
        """
        # Materialize first: the size check needs len() and the JSON payload
        # needs a concrete sequence, neither of which a bare iterator offers.
        docs = list(records)
        if len(docs) > 250:
            raise RuntimeError(f"""The maximum batch size is 250 documents.""")
        info = self._post(f"/{self.id}/doc", dict(docs=docs, save=save))
        job_id = info.json()["job_id"]
        job = Job(self.API_key, job_id, 1)
        return job.wait()

    def upload(self,
               records: Iterable[Dict],
               batch_size: int = 200,
               commit: bool = True):
        """Uploads documents to the index and commit them for
        permanent storage. Documents are processed by the AI model if the
        index has been trained.

        Documents are provided as a list of dictionaries. The content of
        each document must be plain text and is provided under the
        required field doc. You may provide a unique document identifier
        under the optional field doc_id. If no doc_id is provided, we will
        create an identifier by hashing the contents of the
        document. Documents can be updated via an upsert mechanism that
        matches on doc_id. If doc_id is not provided and two docs have
        identical content, the most recently uploaded document will upsert
        the previously uploaded document.

        This is a locking operation. A client cannot call upload, train or
        commit while an upload is already in progress. Consequently, the
        operation is more efficient with batches of documents. The API
        supports a batch size of up to 250 documents at a time. The larger
        the batch size, the more efficient the upload.

        https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/writeDocs

        Args:
            records: Documents to upload.
            batch_size: Number of documents sent per request.
            commit: When true, call ``commit()`` after the last batch.

        Returns:
            The concatenated per-batch ``result["results"]`` lists.

        Raises:
            RuntimeError: If the index state is neither "untrained" nor
                "ready".
        """

        state = self.get_status()["state"]
        if "untrained" == state:
            print("Uploading data to UNTRAINED index for training.")
        elif "ready" == state:
            print("Uploading data to TRAINED index for prediction.")
        else:
            raise RuntimeError(f"""Unknown status "{state}" for index "{self.name}".""")
        results = []
        # Upload docs to the staging index. Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index/{index_id}/doc \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #      "docs": JSON_DOC_DATA
        #    }'

        print("uploading data to index...")
        for i, batch in enumerate(chunked(records, batch_size)):
            info = self._upload_batch(batch)
            results.extend(info["result"]["results"])
            print(f"""    upload batch {1+i:4d}""")
        if commit:
            self.commit()
        return results

    def train(self, params: Dict, force: bool = False, wait: bool = True):
        """Trains an AI model on all documents in the production
        index. Once an index has been trained, documents are queryable,
        and the model automatically processes subsequently uploaded
        documents.

        The AI model identifies thematic information in documents, permitting
        semantic indexing and semantic search. It also enables quantitative
        analysis of, e.g., topic trends.

        The AI model may optionally be supervised using metadata present in the
        index. Thematic decomposition of the data is not unique; supervision
        guides the model and aligns the identified topics to your intended
        application. Supervision also allows the model to make predictions.

        Data for supervision may be supplied explicitly using the
        label_field_names parameter. Metadata field names listed in this
        parameter must each store data in a ternary true/false/unknown format.
        For convenience, supervision data may also be supplied in a sparse "tag"
        format using the tag_field_names parameter. Metadata field names listed
        in this parameter must contain a list of labels for each document. The
        document is considered "true" for each label listed; it is implicitly
        considered "false" for each label not listed. Consequently, the "tag"
        format does not allow for unknown labels. Any combination of
        label_field_names and tag_field_names may be supplied.

        https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/trainIndex

        Args:
            params: Training parameters, sent as the request body.
            force: When true, train even if the state is not "untrained".
            wait: When true, block until the job finishes and return the
                value of ``Job.wait()``; otherwise return the ``Job`` itself.

        Returns:
            The current index status when training is skipped, otherwise
            the job result or the ``Job`` (see ``wait``).
        """

        status = self.get_status()
        if ("untrained" != status["state"]) and not force:
            print(f"index {self.name} is already trained.")
            return status

        # Issue a training command to the index. Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index/{index_id}/train \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #      PARAMS
        #    }'

        info = self._post(f"/{self.id}/train", params)
        job_id = info.json()["job_id"]
        job = Job(self.API_key, job_id, 30)
        if wait:
            return job.wait()
        return job

    def predict(self, records: Iterable[Dict], batch_size: int = 200):
        """"Predict" function analogous to sklearn or keras: accepts
        a batch of documents and returns their corresponding predictions.

        Performs an upload operation with `save=false` and without a commit step.
        This function does not mutate the index in any way.

        https://sturdystatistics.com/api/documentation#tag/apitextv1/operation/writeDocs

        Args:
            records: Documents to run predictions on.
            batch_size: Number of documents sent per request.

        Returns:
            The concatenated per-batch ``result["results"]`` lists.

        Raises:
            RuntimeError: If the index state is not "ready".
        """

        state = self.get_status()["state"]
        if "ready" != state:
            raise RuntimeError(f"""Cannot run predictions on index "{self.name}" with state {state}.""")

        results = []

        # Upload docs to the staging index. Equivalent to:
        #
        # curl -X POST https://sturdystatistics.com/api/text/v1/index/{index_id}/doc \
        #   -H "Content-Type: application/json" \
        #   -d '{
        #      "api_key": "API_KEY",
        #      "save": "false",
        #      "docs": JSON_DOC_DATA
        #    }'

        print("running predictions...")
        for i, batch in enumerate(chunked(records, batch_size)):
            # Send only the current batch; sending all of `records` each time
            # would duplicate results and break the 250-document limit.
            info = self._upload_batch(batch, save="false")
            results.extend(info["result"]['results'])
            print(f"""    upload batch {1+i:4d}: response {str(info)}""")
        print("...done")

        # no commit needed since this makes no change to the index

        return results

    def query(
        self,
        search_query: Optional[str] = None,
        topic_id: Optional[int] = None,
        topic_group_id: Optional[int] = None,
        filters: str = "",
        offset: int = 0,
        limit: int = 20,
        sort_by: str = "relevance",
        ascending: bool = False,
        summarize_by: str = "paragraph",
    ):
        """GET ``/{index_id}/doc`` with the given options and return the JSON.

        ``search_query``, ``topic_id`` and ``topic_group_id`` are only sent
        when they are not ``None``; every other argument is always sent.
        """
        params = dict(
            offset=offset,
            limit=limit,
            sort_by=sort_by,
            ascending=ascending,
            summarize_by=summarize_by,
            filters=filters
        )
        if search_query is not None:
            params["query"] = search_query
        if topic_id is not None:
            # NOTE(review): sent under the plural key "topic_ids" -- confirm
            # against the API documentation.
            params['topic_ids'] = topic_id
        if topic_group_id is not None:
            params["topic_group_id"] = topic_group_id

        res = self._get(f"/{self.id}/doc", params)
        return res.json()

    def getDocs(
        self,
        doc_ids: list[str]
    ):
        """GET the documents whose ids are listed in ``doc_ids``.

        The ids are joined with commas into the request path; the decoded
        JSON response is returned. ``doc_ids`` must not be empty.
        """
        assert len(doc_ids) > 0
        joined = ",".join(doc_ids)
        return self._get(f"/{self.id}/doc/{joined}", dict()).json()

    def topicDiff(
        self,
        q1: str,
        q2: str = "",
        limit: int = 20,
        cutoff: float = 2.0,
        min_confidence: float = 95,
    ):
        """GET ``/{index_id}/topic/diff`` and return the decoded JSON.

        ``q2`` is only sent when it contains non-whitespace characters.
        """
        params = dict(
            q1=q1,
            limit=limit,
            cutoff=cutoff,
            min_confidence=min_confidence
        )
        if q2.strip():
            params["q2"] = q2
        res = self._get(f"/{self.id}/topic/diff", params)
        return res.json()

    def listJobs(
        self,
        status: str= "RUNNING",
        job_name: Optional[str] = None
    ):
        """List jobs for this index through the job endpoint.

        Args:
            status: One of None, "", "RUNNING", "FAILED", "SUCCEEDED",
                "PENDING"; only sent when not None and not blank.
            job_name: One of None, "", "trainIndex", "commitIndex",
                "unstageIndex", "writeDocs"; only sent when not None and
                not blank.

        Returns:
            The decoded JSON response.
        """
        assert status in [None, "", "RUNNING", "FAILED", "SUCCEEDED", "PENDING"]
        assert job_name in [None, "", "trainIndex", "commitIndex", "unstageIndex", "writeDocs"]
        params = dict(index_id = self.id)
        if status is not None and status.strip() != "":
            params["status"] = status
        if job_name is not None and job_name.strip() != "":
            params["job_name"] = job_name

        # A Job with an empty id is used only for its `_get`, which targets
        # the job endpoint's base URL.
        job = Job(self.API_key, "", 1)
        res = job._get("", params)
        return res.json()
@@ -0,0 +1,60 @@
1
+ import requests
2
+ from time import sleep
3
+ import json
4
+
5
+ import srsly
6
+
7
+ # for type checking
8
+ from typing import Dict
9
+ from requests.models import Response
10
+
11
+
12
class Job:
    """Handle for a job of the Sturdy Statistics text API.

    Wraps the job endpoint for one ``job_id`` and offers polling helpers.
    """

    def __init__(self, API_key, job_id, poll_seconds = 1):
        """Store the credentials and polling settings for ``job_id``.

        Args:
            API_key: API key sent with every request.
            job_id: Identifier appended to the job endpoint URL.
            poll_seconds: Seconds slept between status polls in ``wait``.
        """
        self.API_key = API_key
        self.job_id = job_id
        self.poll_seconds = poll_seconds
        # Production endpoint; every request URL is built relative to it.
        self.base_url = "https://sturdystatistics.com/api/text/v1/job"

    def _check_status(self, info: "Response") -> None:
        """Raise if ``info`` is not an HTTP 200 response.

        The status code and response body are printed before raising.

        Raises:
            AssertionError: If the status code is not 200. The exception
                type is kept for backward compatibility, but it is raised
                explicitly so the check is not stripped under ``python -O``.
        """
        if 200 == info.status_code:
            return
        print(f"""error code {info.status_code}""")
        # errors="replace" keeps an undecodable body from hiding the HTTP error.
        print(info.content.decode("utf-8", errors="replace"))
        raise AssertionError(
            f"request failed with status code {info.status_code}"
        )

    def _post(self, url: str, params: Dict) -> "Response":
        """POST ``params`` plus the API key as JSON to ``base_url + url``."""
        payload = {"api_key": self.API_key, **params}
        res = requests.post(self.base_url + url, json=payload)
        self._check_status(res)
        return res

    def _get(self, url: str, params: Dict) -> "Response":
        """GET ``base_url + url`` with ``params`` plus the API key as query args."""
        params = {"api_key": self.API_key, **params}
        res = requests.get(self.base_url + url, params=params)
        self._check_status(res)
        return res

    def get_status(self):
        """Fetch this job's status as a dict.

        When the response carries a string ``result`` field it is decoded
        from JSON in place; any other value is left untouched.
        """
        res = self._get("/" + self.job_id, dict()).json()
        if isinstance(res.get("result"), (str, bytes, bytearray)):
            res["result"] = json.loads(res["result"])
        return res

    def _is_running(self):
        """Return True unless the job status is "FAILED" or "SUCCEEDED"."""
        status = self.get_status()
        return status["status"] not in ["FAILED", "SUCCEEDED"]

    def wait(self):
        """Poll until the job reaches a terminal status and return it.

        Sleeps ``poll_seconds`` between polls. The status that ended the
        loop is the one returned, so no extra request is made afterwards.

        Raises:
            Exception: If the job ends with status "FAILED".
        """
        while True:
            status = self.get_status()
            if status["status"] in ("FAILED", "SUCCEEDED"):
                break
            sleep(self.poll_seconds)
        if status["status"] == "FAILED":
            # .get() so a missing "error" field cannot mask the job failure.
            raise Exception(f"Job {self.job_id} failed with the following error: {status.get('error')}")
        return status
60
+