arize-phoenix 3.19.4__py3-none-any.whl → 3.20.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release: this version of arize-phoenix might be problematic.
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/METADATA +1 -1
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/RECORD +23 -18
- phoenix/__init__.py +7 -3
- phoenix/core/model.py +8 -6
- phoenix/core/model_schema_adapter.py +6 -6
- phoenix/datasets/dataset.py +9 -521
- phoenix/datasets/fixtures.py +16 -552
- phoenix/datasets/schema.py +24 -145
- phoenix/inferences/__init__.py +0 -0
- phoenix/inferences/fixtures.py +560 -0
- phoenix/inferences/inferences.py +525 -0
- phoenix/inferences/schema.py +151 -0
- phoenix/server/app.py +5 -0
- phoenix/server/main.py +8 -8
- phoenix/session/evaluation.py +1 -2
- phoenix/session/session.py +16 -16
- phoenix/utilities/deprecation.py +30 -0
- phoenix/version.py +1 -1
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/WHEEL +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/licenses/IP_NOTICE +0 -0
- {arize_phoenix-3.19.4.dist-info → arize_phoenix-3.20.0.dist-info}/licenses/LICENSE +0 -0
- /phoenix/{datasets → inferences}/errors.py +0 -0
- /phoenix/{datasets → inferences}/validation.py +0 -0
phoenix/inferences/fixtures.py (new file)
@@ -0,0 +1,560 @@
import json
import logging
from dataclasses import dataclass, replace
from enum import Enum, auto
from pathlib import Path
from typing import Iterator, NamedTuple, Optional, Tuple
from urllib import request
from urllib.parse import quote, urljoin

from pandas import read_parquet

from phoenix.config import DATASET_DIR
from phoenix.inferences.inferences import Inferences
from phoenix.inferences.schema import (
    EmbeddingColumnNames,
    RetrievalEmbeddingColumnNames,
    Schema,
)

logger = logging.getLogger(__name__)


class DatasetRole(Enum):
    PRIMARY = auto()
    REFERENCE = auto()
    CORPUS = auto()


@dataclass(frozen=True)
class Fixture:
    name: str
    description: str
    prefix: str
    primary_file_name: str
    primary_schema: Schema
    reference_file_name: Optional[str] = None
    # The schema for the reference dataset. If not provided, the primary schema will be used
    reference_schema: Optional[Schema] = None
    corpus_file_name: Optional[str] = None
    corpus_schema: Optional[Schema] = None

    def paths(self) -> Iterator[Tuple[DatasetRole, Path]]:
        return (
            (role, Path(self.prefix) / name)
            for role, name in zip(
                DatasetRole,
                (
                    self.primary_file_name,
                    self.reference_file_name,
                    self.corpus_file_name,
                ),
            )
            if name
        )


sentiment_classification_language_drift_schema = Schema(
    prediction_id_column_name="prediction_id",
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="pred_label",
    actual_label_column_name="label",
    embedding_feature_column_names={
        "text_embedding": EmbeddingColumnNames(
            vector_column_name="text_vector", raw_data_column_name="text"
        ),
    },
)
sentiment_classification_language_drift_fixture = Fixture(
    name="sentiment_classification_language_drift",
    description="""
        Highlights issues that occur maintaining a sentiment classification model.
        This model takes online reviews of your U.S.-based product as the input and
        predicts whether the reviewer's sentiment was positive, negative, or
        neutral.

        You trained your sentiment classification model on English reviews. However,
        once the model was released into production, you notice that the performance
        of the model has degraded over a period of time.

        Phoenix is able to surface the reason for this performance degradation. In
        this example, the presence of reviews written in Spanish impact the model's
        performance. You can surface and troubleshoot this issue by analyzing the
        embedding vectors associated with the online review text.
    """,
    primary_schema=sentiment_classification_language_drift_schema,
    reference_schema=sentiment_classification_language_drift_schema,
    prefix="unstructured/nlp/sentiment-classification-language-drift",
    primary_file_name="sentiment_classification_language_drift_production.parquet",
    reference_file_name="sentiment_classification_language_drift_training.parquet",
)

image_classification_schema = Schema(
    timestamp_column_name="prediction_ts",
    prediction_label_column_name="predicted_action",
    actual_label_column_name="actual_action",
    embedding_feature_column_names={
        "image_embedding": EmbeddingColumnNames(
            vector_column_name="image_vector",
            link_to_data_column_name="url",
        ),
    },
)

image_classification_fixture = Fixture(
    name="image_classification",
    description="""
        Imagine you're in charge of maintaining a model that classifies the action
        of people in photographs. Your model initially performs well in production,
        but its performance gradually degrades over time.
    """,
    primary_schema=replace(image_classification_schema, actual_label_column_name=None),
    reference_schema=image_classification_schema,
    prefix="unstructured/cv/human-actions",
    primary_file_name="human_actions_production.parquet",
    reference_file_name="human_actions_training.parquet",
)

fashion_mnist_primary_schema = Schema(
    timestamp_column_name="prediction_ts",
    embedding_feature_column_names={
        "embedding": EmbeddingColumnNames(
            vector_column_name="embeddings", link_to_data_column_name="image_url"
        ),
    },
    actual_label_column_name="actual_label",
    prediction_label_column_name="predicted_label",
)
fashion_mnist_reference_schema = replace(fashion_mnist_primary_schema, timestamp_column_name=None)
fashion_mnist_fixture = Fixture(
    name="fashion_mnist",
    description="""
        Fashion-MNIST is a dataset of Zalando's article images consisting of a
        training set of 60,000 examples and a test set of 10,000 examples. Each
        example is a 28x28 grayscale image, associated with a label from 10 classes.
        Fashion-MNIST serves as a direct drop-in replacement for the original MNIST
        dataset for benchmarking machine learning algorithms. It shares the same
        image size and structure of training and testing splits.
    """,
    primary_schema=fashion_mnist_primary_schema,
    reference_schema=fashion_mnist_reference_schema,
    prefix="unstructured/cv/fashion-mnist",
    primary_file_name="fashion_mnist_production.parquet",
    reference_file_name="fashion_mnist_train.parquet",
)

ner_token_drift_schema = Schema(
    timestamp_column_name="prediction_ts",
    feature_column_names=["language"],
    actual_label_column_name="label",
    prediction_label_column_name="pred_label",
    embedding_feature_column_names={
        "embedding": EmbeddingColumnNames(
            vector_column_name="token_vector", raw_data_column_name="text"
        )
    },
)
ner_token_drift_fixture = Fixture(
    name="ner_token_drift",
    description="""
        You are in charge of maintaining a Named Entity Recognition (NER) model.
        This simple model can automatically scan text, pull out some fundamental
        entities within it, and classify them into predefined categories: Person,
        Location, or Organization. However, once the model was released into
        production, you notice that the performance of the model has degraded over a
        period of time.

        Phoenix is able to surface the reason for this performance degradation. In
        this example, text including locations is under-represented in the training
        set. This label imbalance impacts the model's performance. You can surface
        and troubleshoot this issue by analyzing the embedding vectors associated
        with the input text.
    """,
    primary_schema=ner_token_drift_schema,
    reference_schema=ner_token_drift_schema,
    prefix="unstructured/nlp/named-entity-recognition",
    primary_file_name="ner_token_drift_production.parquet",
    reference_file_name="ner_token_drift_train.parquet",
)

credit_card_fraud_schema = Schema(
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="predicted_label",
    prediction_score_column_name="predicted_score",
    actual_label_column_name="actual_label",
    timestamp_column_name="prediction_timestamp",
    tag_column_names=["age"],
    embedding_feature_column_names={
        "tabular_embedding": EmbeddingColumnNames(vector_column_name="tabular_vector"),
    },
)
credit_card_fraud_fixture = Fixture(
    name="credit_card_fraud",
    description="""
        Use-case for a credit card fraud detection model at a large bank or payment
        processing company.

        You have been alerted by a spike in credit card chargebacks leading you to
        suspect that fraudsters are getting away with committing fraud undetected!

        Realizing that this flaw in your model's performance has a heavy cost on
        your company and customers, you understand the need for a powerful tools to
        troubleshoot and prevent costly model degradations. You turn to Phoenix to
        find out what changed in your credit card fraud detection model and how you
        can improve it.
    """,
    primary_schema=credit_card_fraud_schema,
    reference_schema=credit_card_fraud_schema,
    prefix="structured/credit-card-fraud",
    primary_file_name="credit_card_fraud_production.parquet",
    reference_file_name="credit_card_fraud_train.parquet",
)

click_through_rate_schema = Schema(
    timestamp_column_name="prediction_timestamp",
    prediction_id_column_name="prediction_id",
    prediction_label_column_name="predicted_label",
    prediction_score_column_name="predicted_score",
    actual_label_column_name="actual_label",
)
click_through_rate_fixture = Fixture(
    name="click_through_rate",
    description="""
        Investigate various performance related aspects of an online advertisement
        use-case. These datasets are designed for analyzing Click-through Rate (CTR)
        performance.

        You manage the models for an online advertising platform. You have spent a
        great deal of your time collecting online data and training models for best
        performance. With your models now in production you have no tools available
        to your disposal to monitor the performance of your models, identify any
        issues, or get insights into how to improve your models.

        This use-case highlights a common advertisement use-case and is tailored for
        analyzing CTR for an ad or ad group.
    """,
    primary_schema=click_through_rate_schema,
    reference_schema=click_through_rate_schema,
    prefix="structured/click-through-rate",
    primary_file_name="click_through_rate_production.parquet",
    reference_file_name="click_through_rate_train.parquet",
)

chatbot_queries_schema = Schema(
    prediction_id_column_name="id",
    prompt_column_names=RetrievalEmbeddingColumnNames(
        vector_column_name="prompt",
        raw_data_column_name="prompt_text",
        context_retrieval_ids_column_name="document_ids",
        context_retrieval_scores_column_name="document_scores",
    ),
    response_column_names="response",
    tag_column_names=[
        "answer_relevancy",
        "context_relevancy",
        "faithfulness",
        "document_similarity_0",
        "document_similarity_1",
        "openai_relevance_0",
        "openai_relevance_1",
        "user_feedback",
    ],
)

chatbot_database_schema = Schema(
    prediction_id_column_name="document_id",
    prompt_column_names=EmbeddingColumnNames(
        vector_column_name="text_vector",
        raw_data_column_name="text",
    ),
)

chatbot_fixture = Fixture(
    name="chatbot",
    description="""
        Investigate RAG performance for a chatbot built on top of Arize's documentation.
        This use-case highlights how embedding visualizations for a RAG application can
        highlight issues with the application's retrieval and performance.

        The data contains relevance metrics generated by LLM Evals as well as RAGAS.
    """,
    primary_schema=chatbot_queries_schema,
    corpus_schema=chatbot_database_schema,
    prefix="unstructured/llm/chatbot",
    primary_file_name="chatbot_queries_with_ragas.parquet",
    corpus_file_name="chatbot_database_ds.parquet",
)

wide_data_primary_schema = Schema(
    actual_label_column_name="actual_label",
    prediction_label_column_name="predicted_label",
    timestamp_column_name="prediction_ts",
)
wide_data_reference_schema = replace(wide_data_primary_schema, timestamp_column_name=None)
wide_data_fixture = Fixture(
    name="wide_data",
    description="""
        Use-case that for a wide data model (e.g. a large amount of features). For
        developer use only.
    """,
    primary_schema=wide_data_primary_schema,
    reference_schema=wide_data_reference_schema,
    prefix="structured/wide-data",
    primary_file_name="wide_data_production.parquet",
    reference_file_name="wide_data_train.parquet",
)

deep_data_primary_schema = Schema(
    timestamp_column_name="prediction_ts",
    actual_label_column_name="actual_label",
    prediction_label_column_name="predicted_label",
)
deep_data_reference_schema = replace(deep_data_primary_schema, timestamp_column_name=None)
deep_data_fixture = Fixture(
    name="deep_data",
    description="""
        Use-case that for a deep data model (e.g. a lot of feature values).
        For developer use only.
    """,
    primary_schema=deep_data_primary_schema,
    reference_schema=deep_data_reference_schema,
    prefix="structured/deep-data",
    primary_file_name="deep_data_production.parquet",
    reference_file_name="deep_data_train.parquet",
)


llm_summarization_schema = Schema(
    timestamp_column_name="prediction_timestamp",
    tag_column_names=[
        "rougeL_score",
        "reference_summary",
    ],
    prompt_column_names=EmbeddingColumnNames(
        vector_column_name="article_vector", raw_data_column_name="article"
    ),
    response_column_names=EmbeddingColumnNames(
        vector_column_name="summary_vector", raw_data_column_name="summary"
    ),
)
llm_summarization_fixture = Fixture(
    name="llm_summarization",
    description="""
        LLM summarization data.
    """,
    primary_schema=llm_summarization_schema,
    reference_schema=llm_summarization_schema,
    prefix="unstructured/llm/summarization",
    primary_file_name="llm_summarization_prod.parquet",
    reference_file_name="llm_summarization_baseline.parquet",
)

wikipedia_fixture = Fixture(
    name="wiki",
    description="""
        Semantic search dataset including queries, answers, retrievals and corpus
        documents. Queries are sampled from Google's Natural Questions dataset
        https://ai.google.com/research/NaturalQuestions, and documents (paragraphs)
        are sampled from http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz,
        based on the Simple English Wikipedia. Embeddings for both questions and
        documents are generated from the bi-encoder 'multi-qa-MiniLM-L6-cos-v1',
        which produces normalized vectors that are 384-dimensional. Given a query
        and its embedding, a small subset of documents are first retrieved based on
        cosine similarity search on the document embeddings, then relevance scores
        are computed on the retrieved documents using the cross-encoder
        'cross-encoder/ms-marco-MiniLM-L-12-v2'. Answers (to the queries) are
        generated from 'deepset/tinyroberta-squad2' using the most relevant
        documents as the context input.

        References:
        - https://www.sbert.net/examples/applications/semantic-search/README.html
        - https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications/retrieve_rerank
        - https://www.sbert.net/docs/pretrained_models.html
        - https://www.sbert.net/docs/pretrained_cross-encoders.html
        - https://ai.google.com/research/NaturalQuestions
    """,  # noqa: E501
    primary_schema=Schema(
        prediction_id_column_name="id",
        prompt_column_names=RetrievalEmbeddingColumnNames(
            vector_column_name="embedding",
            raw_data_column_name="question",
            context_retrieval_ids_column_name="retrievals",
            context_retrieval_scores_column_name="scores",
        ),
        response_column_names="answer",
    ),
    corpus_schema=Schema(
        id_column_name="id",
        document_column_names=EmbeddingColumnNames(
            vector_column_name="embedding",
            raw_data_column_name="text",
        ),
    ),
    prefix="unstructured/search/wiki",
    primary_file_name="queries.parquet",
    # The reference file is intended purely for troubleshooting bad data. Un-comment for testing.
    # reference_file_name="queries2.parquet",
    corpus_file_name="corpus.parquet",
)

FIXTURES: Tuple[Fixture, ...] = (
    sentiment_classification_language_drift_fixture,
    image_classification_fixture,
    fashion_mnist_fixture,
    ner_token_drift_fixture,
    credit_card_fraud_fixture,
    click_through_rate_fixture,
    wide_data_fixture,
    deep_data_fixture,
    llm_summarization_fixture,
    wikipedia_fixture,
    chatbot_fixture,
)
NAME_TO_FIXTURE = {fixture.name: fixture for fixture in FIXTURES}


def get_datasets(
    fixture_name: str,
    no_internet: bool = False,
) -> Tuple[Inferences, Optional[Inferences], Optional[Inferences]]:
    """
    Downloads primary and reference datasets for a fixture if they are not found
    locally.
    """
    fixture = _get_fixture_by_name(fixture_name=fixture_name)
    if no_internet:
        paths = {role: DATASET_DIR / path for role, path in fixture.paths()}
    else:
        paths = dict(_download(fixture, DATASET_DIR))
    primary_dataset = Inferences(
        read_parquet(paths[DatasetRole.PRIMARY]),
        fixture.primary_schema,
        "production",
    )
    reference_dataset = None
    if fixture.reference_file_name is not None:
        reference_dataset = Inferences(
            read_parquet(paths[DatasetRole.REFERENCE]),
            fixture.reference_schema
            if fixture.reference_schema is not None
            else fixture.primary_schema,
            "training",
        )
    corpus_dataset = None
    if fixture.corpus_file_name is not None:
        corpus_dataset = Inferences(
            read_parquet(paths[DatasetRole.CORPUS]),
            fixture.corpus_schema,
            "knowledge_base",
        )
    return primary_dataset, reference_dataset, corpus_dataset


def _get_fixture_by_name(fixture_name: str) -> Fixture:
    """
    Returns the fixture whose name matches the input name. Raises a ValueError
    if the input fixture name does not match any known fixture names.
    """
    if fixture_name not in NAME_TO_FIXTURE:
        valid_fixture_names = ", ".join(NAME_TO_FIXTURE.keys())
        raise ValueError(f'"{fixture_name}" is invalid. Valid names are: {valid_fixture_names}')
    return NAME_TO_FIXTURE[fixture_name]


@dataclass
class ExampleInferences:
    """
    A primary and optional reference dataset pair.
    """

    primary: Inferences
    reference: Optional[Inferences] = None
    corpus: Optional[Inferences] = None


def load_example(use_case: str) -> ExampleInferences:
    """
    Loads an example primary and reference dataset for a given use-case.

    Parameters
    ----------
    use_case: str
        Name of the phoenix supported use case Valid values include:
            - "sentiment_classification_language_drift"
            - "image_classification"
            - "fashion_mnist"
            - "ner_token_drift"
            - "credit_card_fraud"
            - "click_through_rate"


    Returns
    _______
    datasets: DatasetDict
        A dictionary of datasets, split out by dataset type (primary,
        reference).

    """
    fixture = _get_fixture_by_name(use_case)
    primary_dataset, reference_dataset, corpus_dataset = get_datasets(use_case)
    print(f"📥 Loaded {use_case} example datasets.")
    print("ℹ️ About this use-case:")
    print(fixture.description)
    return ExampleInferences(
        primary=primary_dataset,
        reference=reference_dataset,
        corpus=corpus_dataset,
    )


class Metadata(NamedTuple):
    path: str
    mediaLink: str
    md5Hash: str

    def save_artifact(self, location: Path) -> Path:
        data_file_path = location / self.path
        md5_file = data_file_path.with_name(data_file_path.stem + ".md5")
        data_file_path.parents[0].mkdir(parents=True, exist_ok=True)
        if data_file_path.is_file() and md5_file.is_file():
            with open(md5_file, "r") as f:
                if f.readline() == self.md5Hash:
                    return data_file_path
        request.urlretrieve(self.mediaLink, data_file_path)
        with open(md5_file, "w") as f:
            f.write(self.md5Hash)
        return data_file_path


class GCSAssets(NamedTuple):
    host: str = "https://storage.googleapis.com/"
    bucket: str = "arize-assets"
    prefix: str = "phoenix/datasets/"

    def metadata(self, path: Path) -> Metadata:
        url = urljoin(
            urljoin(self.host, f"storage/v1/b/{self.bucket}/o/"),
            quote(urljoin(self.prefix, str(path)), safe=""),
        )
        resp = json.loads(request.urlopen(request.Request(url)).read())
        return Metadata(
            resp["name"][len(self.prefix) :],
            resp["mediaLink"],
            resp["md5Hash"],
        )


def _download(fixture: Fixture, location: Path) -> Iterator[Tuple[DatasetRole, Path]]:
    for role, path in fixture.paths():
        yield role, GCSAssets().metadata(path).save_artifact(location)


# Download all fixtures
if __name__ == "__main__":
    import time

    for fixture in FIXTURES:
        start_time = time.time()
        print(f"getting {fixture.name}", end="...")
        dict(_download(fixture, DATASET_DIR))
        print(f"done ({time.time() - start_time:.2f}s)")