arize-phoenix 3.19.4__py3-none-any.whl → 3.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of arize-phoenix might be problematic.

@@ -0,0 +1,560 @@
+import json
+import logging
+from dataclasses import dataclass, replace
+from enum import Enum, auto
+from pathlib import Path
+from typing import Iterator, NamedTuple, Optional, Tuple
+from urllib import request
+from urllib.parse import quote, urljoin
+
+from pandas import read_parquet
+
+from phoenix.config import DATASET_DIR
+from phoenix.inferences.inferences import Inferences
+from phoenix.inferences.schema import (
+    EmbeddingColumnNames,
+    RetrievalEmbeddingColumnNames,
+    Schema,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class DatasetRole(Enum):
+    PRIMARY = auto()
+    REFERENCE = auto()
+    CORPUS = auto()
+
+
+@dataclass(frozen=True)
+class Fixture:
+    name: str
+    description: str
+    prefix: str
+    primary_file_name: str
+    primary_schema: Schema
+    reference_file_name: Optional[str] = None
+    # The schema for the reference dataset. If not provided, the primary schema will be used.
+    reference_schema: Optional[Schema] = None
+    corpus_file_name: Optional[str] = None
+    corpus_schema: Optional[Schema] = None
+
+    def paths(self) -> Iterator[Tuple[DatasetRole, Path]]:
+        return (
+            (role, Path(self.prefix) / name)
+            for role, name in zip(
+                DatasetRole,
+                (
+                    self.primary_file_name,
+                    self.reference_file_name,
+                    self.corpus_file_name,
+                ),
+            )
+            if name
+        )
+
+
+sentiment_classification_language_drift_schema = Schema(
+    prediction_id_column_name="prediction_id",
+    timestamp_column_name="prediction_ts",
+    prediction_label_column_name="pred_label",
+    actual_label_column_name="label",
+    embedding_feature_column_names={
+        "text_embedding": EmbeddingColumnNames(
+            vector_column_name="text_vector", raw_data_column_name="text"
+        ),
+    },
+)
+sentiment_classification_language_drift_fixture = Fixture(
+    name="sentiment_classification_language_drift",
+    description="""
+    Highlights issues that occur while maintaining a sentiment classification model.
+    This model takes online reviews of your U.S.-based product as the input and
+    predicts whether the reviewer's sentiment was positive, negative, or
+    neutral.
+
+    You trained your sentiment classification model on English reviews. However,
+    once the model was released into production, you notice that the performance
+    of the model has degraded over a period of time.
+
+    Phoenix is able to surface the reason for this performance degradation. In
+    this example, the presence of reviews written in Spanish impacts the model's
+    performance. You can surface and troubleshoot this issue by analyzing the
+    embedding vectors associated with the online review text.
+    """,
+    primary_schema=sentiment_classification_language_drift_schema,
+    reference_schema=sentiment_classification_language_drift_schema,
+    prefix="unstructured/nlp/sentiment-classification-language-drift",
+    primary_file_name="sentiment_classification_language_drift_production.parquet",
+    reference_file_name="sentiment_classification_language_drift_training.parquet",
+)
+
+image_classification_schema = Schema(
+    timestamp_column_name="prediction_ts",
+    prediction_label_column_name="predicted_action",
+    actual_label_column_name="actual_action",
+    embedding_feature_column_names={
+        "image_embedding": EmbeddingColumnNames(
+            vector_column_name="image_vector",
+            link_to_data_column_name="url",
+        ),
+    },
+)
+
+image_classification_fixture = Fixture(
+    name="image_classification",
+    description="""
+    Imagine you're in charge of maintaining a model that classifies the action
+    of people in photographs. Your model initially performs well in production,
+    but its performance gradually degrades over time.
+    """,
+    primary_schema=replace(image_classification_schema, actual_label_column_name=None),
+    reference_schema=image_classification_schema,
+    prefix="unstructured/cv/human-actions",
+    primary_file_name="human_actions_production.parquet",
+    reference_file_name="human_actions_training.parquet",
+)
+
+fashion_mnist_primary_schema = Schema(
+    timestamp_column_name="prediction_ts",
+    embedding_feature_column_names={
+        "embedding": EmbeddingColumnNames(
+            vector_column_name="embeddings", link_to_data_column_name="image_url"
+        ),
+    },
+    actual_label_column_name="actual_label",
+    prediction_label_column_name="predicted_label",
+)
+fashion_mnist_reference_schema = replace(fashion_mnist_primary_schema, timestamp_column_name=None)
+fashion_mnist_fixture = Fixture(
+    name="fashion_mnist",
+    description="""
+    Fashion-MNIST is a dataset of Zalando's article images consisting of a
+    training set of 60,000 examples and a test set of 10,000 examples. Each
+    example is a 28x28 grayscale image, associated with a label from 10 classes.
+    Fashion-MNIST serves as a direct drop-in replacement for the original MNIST
+    dataset for benchmarking machine learning algorithms. It shares the same
+    image size and structure of training and testing splits.
+    """,
+    primary_schema=fashion_mnist_primary_schema,
+    reference_schema=fashion_mnist_reference_schema,
+    prefix="unstructured/cv/fashion-mnist",
+    primary_file_name="fashion_mnist_production.parquet",
+    reference_file_name="fashion_mnist_train.parquet",
+)
+
+ner_token_drift_schema = Schema(
+    timestamp_column_name="prediction_ts",
+    feature_column_names=["language"],
+    actual_label_column_name="label",
+    prediction_label_column_name="pred_label",
+    embedding_feature_column_names={
+        "embedding": EmbeddingColumnNames(
+            vector_column_name="token_vector", raw_data_column_name="text"
+        )
+    },
+)
+ner_token_drift_fixture = Fixture(
+    name="ner_token_drift",
+    description="""
+    You are in charge of maintaining a Named Entity Recognition (NER) model.
+    This simple model can automatically scan text, pull out some fundamental
+    entities within it, and classify them into predefined categories: Person,
+    Location, or Organization. However, once the model was released into
+    production, you notice that the performance of the model has degraded over a
+    period of time.
+
+    Phoenix is able to surface the reason for this performance degradation. In
+    this example, text including locations is under-represented in the training
+    set. This label imbalance impacts the model's performance. You can surface
+    and troubleshoot this issue by analyzing the embedding vectors associated
+    with the input text.
+    """,
+    primary_schema=ner_token_drift_schema,
+    reference_schema=ner_token_drift_schema,
+    prefix="unstructured/nlp/named-entity-recognition",
+    primary_file_name="ner_token_drift_production.parquet",
+    reference_file_name="ner_token_drift_train.parquet",
+)
+
+credit_card_fraud_schema = Schema(
+    prediction_id_column_name="prediction_id",
+    prediction_label_column_name="predicted_label",
+    prediction_score_column_name="predicted_score",
+    actual_label_column_name="actual_label",
+    timestamp_column_name="prediction_timestamp",
+    tag_column_names=["age"],
+    embedding_feature_column_names={
+        "tabular_embedding": EmbeddingColumnNames(vector_column_name="tabular_vector"),
+    },
+)
+credit_card_fraud_fixture = Fixture(
+    name="credit_card_fraud",
+    description="""
+    Use-case for a credit card fraud detection model at a large bank or payment
+    processing company.
+
+    You have been alerted by a spike in credit card chargebacks, leading you to
+    suspect that fraudsters are getting away with committing fraud undetected!
+
+    Realizing that this flaw in your model's performance has a heavy cost on
+    your company and customers, you understand the need for powerful tools to
+    troubleshoot and prevent costly model degradations. You turn to Phoenix to
+    find out what changed in your credit card fraud detection model and how you
+    can improve it.
+    """,
+    primary_schema=credit_card_fraud_schema,
+    reference_schema=credit_card_fraud_schema,
+    prefix="structured/credit-card-fraud",
+    primary_file_name="credit_card_fraud_production.parquet",
+    reference_file_name="credit_card_fraud_train.parquet",
+)
+
+click_through_rate_schema = Schema(
+    timestamp_column_name="prediction_timestamp",
+    prediction_id_column_name="prediction_id",
+    prediction_label_column_name="predicted_label",
+    prediction_score_column_name="predicted_score",
+    actual_label_column_name="actual_label",
+)
+click_through_rate_fixture = Fixture(
+    name="click_through_rate",
+    description="""
+    Investigate various performance-related aspects of an online advertisement
+    use-case. These datasets are designed for analyzing Click-through Rate (CTR)
+    performance.
+
+    You manage the models for an online advertising platform. You have spent a
+    great deal of your time collecting online data and training models for best
+    performance. With your models now in production, you have no tools at your
+    disposal to monitor the performance of your models, identify any issues, or
+    get insights into how to improve your models.
+
+    This use-case highlights a common advertisement scenario and is tailored for
+    analyzing CTR for an ad or ad group.
+    """,
+    primary_schema=click_through_rate_schema,
+    reference_schema=click_through_rate_schema,
+    prefix="structured/click-through-rate",
+    primary_file_name="click_through_rate_production.parquet",
+    reference_file_name="click_through_rate_train.parquet",
+)
+
+chatbot_queries_schema = Schema(
+    prediction_id_column_name="id",
+    prompt_column_names=RetrievalEmbeddingColumnNames(
+        vector_column_name="prompt",
+        raw_data_column_name="prompt_text",
+        context_retrieval_ids_column_name="document_ids",
+        context_retrieval_scores_column_name="document_scores",
+    ),
+    response_column_names="response",
+    tag_column_names=[
+        "answer_relevancy",
+        "context_relevancy",
+        "faithfulness",
+        "document_similarity_0",
+        "document_similarity_1",
+        "openai_relevance_0",
+        "openai_relevance_1",
+        "user_feedback",
+    ],
+)
+
+chatbot_database_schema = Schema(
+    prediction_id_column_name="document_id",
+    prompt_column_names=EmbeddingColumnNames(
+        vector_column_name="text_vector",
+        raw_data_column_name="text",
+    ),
+)
+
+chatbot_fixture = Fixture(
+    name="chatbot",
+    description="""
+    Investigate RAG performance for a chatbot built on top of Arize's documentation.
+    This use-case shows how embedding visualizations for a RAG application can
+    surface issues with the application's retrieval and performance.
+
+    The data contains relevance metrics generated by LLM Evals as well as RAGAS.
+    """,
+    primary_schema=chatbot_queries_schema,
+    corpus_schema=chatbot_database_schema,
+    prefix="unstructured/llm/chatbot",
+    primary_file_name="chatbot_queries_with_ragas.parquet",
+    corpus_file_name="chatbot_database_ds.parquet",
+)
+
+wide_data_primary_schema = Schema(
+    actual_label_column_name="actual_label",
+    prediction_label_column_name="predicted_label",
+    timestamp_column_name="prediction_ts",
+)
+wide_data_reference_schema = replace(wide_data_primary_schema, timestamp_column_name=None)
+wide_data_fixture = Fixture(
+    name="wide_data",
+    description="""
+    Use-case for a wide data model (e.g. a large number of features). For
+    developer use only.
+    """,
+    primary_schema=wide_data_primary_schema,
+    reference_schema=wide_data_reference_schema,
+    prefix="structured/wide-data",
+    primary_file_name="wide_data_production.parquet",
+    reference_file_name="wide_data_train.parquet",
+)
+
+deep_data_primary_schema = Schema(
+    timestamp_column_name="prediction_ts",
+    actual_label_column_name="actual_label",
+    prediction_label_column_name="predicted_label",
+)
+deep_data_reference_schema = replace(deep_data_primary_schema, timestamp_column_name=None)
+deep_data_fixture = Fixture(
+    name="deep_data",
+    description="""
+    Use-case for a deep data model (e.g. many feature values).
+    For developer use only.
+    """,
+    primary_schema=deep_data_primary_schema,
+    reference_schema=deep_data_reference_schema,
+    prefix="structured/deep-data",
+    primary_file_name="deep_data_production.parquet",
+    reference_file_name="deep_data_train.parquet",
+)
+
+
+llm_summarization_schema = Schema(
+    timestamp_column_name="prediction_timestamp",
+    tag_column_names=[
+        "rougeL_score",
+        "reference_summary",
+    ],
+    prompt_column_names=EmbeddingColumnNames(
+        vector_column_name="article_vector", raw_data_column_name="article"
+    ),
+    response_column_names=EmbeddingColumnNames(
+        vector_column_name="summary_vector", raw_data_column_name="summary"
+    ),
+)
+llm_summarization_fixture = Fixture(
+    name="llm_summarization",
+    description="""
+    LLM summarization data.
+    """,
+    primary_schema=llm_summarization_schema,
+    reference_schema=llm_summarization_schema,
+    prefix="unstructured/llm/summarization",
+    primary_file_name="llm_summarization_prod.parquet",
+    reference_file_name="llm_summarization_baseline.parquet",
+)
+
+wikipedia_fixture = Fixture(
+    name="wiki",
+    description="""
+    Semantic search dataset including queries, answers, retrievals and corpus
+    documents. Queries are sampled from Google's Natural Questions dataset
+    https://ai.google.com/research/NaturalQuestions, and documents (paragraphs)
+    are sampled from http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz,
+    based on the Simple English Wikipedia. Embeddings for both questions and
+    documents are generated from the bi-encoder 'multi-qa-MiniLM-L6-cos-v1',
+    which produces normalized vectors that are 384-dimensional. Given a query
+    and its embedding, a small subset of documents is first retrieved based on
+    cosine similarity search on the document embeddings, then relevance scores
+    are computed on the retrieved documents using the cross-encoder
+    'cross-encoder/ms-marco-MiniLM-L-12-v2'. Answers (to the queries) are
+    generated from 'deepset/tinyroberta-squad2' using the most relevant
+    documents as the context input.
+
+    References:
+    - https://www.sbert.net/examples/applications/semantic-search/README.html
+    - https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications/retrieve_rerank
+    - https://www.sbert.net/docs/pretrained_models.html
+    - https://www.sbert.net/docs/pretrained_cross-encoders.html
+    - https://ai.google.com/research/NaturalQuestions
+    """,  # noqa: E501
+    primary_schema=Schema(
+        prediction_id_column_name="id",
+        prompt_column_names=RetrievalEmbeddingColumnNames(
+            vector_column_name="embedding",
+            raw_data_column_name="question",
+            context_retrieval_ids_column_name="retrievals",
+            context_retrieval_scores_column_name="scores",
+        ),
+        response_column_names="answer",
+    ),
+    corpus_schema=Schema(
+        id_column_name="id",
+        document_column_names=EmbeddingColumnNames(
+            vector_column_name="embedding",
+            raw_data_column_name="text",
+        ),
+    ),
+    prefix="unstructured/search/wiki",
+    primary_file_name="queries.parquet",
+    # The reference file is intended purely for troubleshooting bad data. Un-comment for testing.
+    # reference_file_name="queries2.parquet",
+    corpus_file_name="corpus.parquet",
+)
+
+FIXTURES: Tuple[Fixture, ...] = (
+    sentiment_classification_language_drift_fixture,
+    image_classification_fixture,
+    fashion_mnist_fixture,
+    ner_token_drift_fixture,
+    credit_card_fraud_fixture,
+    click_through_rate_fixture,
+    wide_data_fixture,
+    deep_data_fixture,
+    llm_summarization_fixture,
+    wikipedia_fixture,
+    chatbot_fixture,
+)
+NAME_TO_FIXTURE = {fixture.name: fixture for fixture in FIXTURES}
+
+
+def get_datasets(
+    fixture_name: str,
+    no_internet: bool = False,
+) -> Tuple[Inferences, Optional[Inferences], Optional[Inferences]]:
+    """
+    Downloads the primary, reference, and corpus datasets for a fixture if they
+    are not found locally.
+    """
+    fixture = _get_fixture_by_name(fixture_name=fixture_name)
+    if no_internet:
+        paths = {role: DATASET_DIR / path for role, path in fixture.paths()}
+    else:
+        paths = dict(_download(fixture, DATASET_DIR))
+    primary_dataset = Inferences(
+        read_parquet(paths[DatasetRole.PRIMARY]),
+        fixture.primary_schema,
+        "production",
+    )
+    reference_dataset = None
+    if fixture.reference_file_name is not None:
+        reference_dataset = Inferences(
+            read_parquet(paths[DatasetRole.REFERENCE]),
+            fixture.reference_schema
+            if fixture.reference_schema is not None
+            else fixture.primary_schema,
+            "training",
+        )
+    corpus_dataset = None
+    if fixture.corpus_file_name is not None:
+        corpus_dataset = Inferences(
+            read_parquet(paths[DatasetRole.CORPUS]),
+            fixture.corpus_schema,
+            "knowledge_base",
+        )
+    return primary_dataset, reference_dataset, corpus_dataset
+
+
+def _get_fixture_by_name(fixture_name: str) -> Fixture:
+    """
+    Returns the fixture whose name matches the input name. Raises a ValueError
+    if the input fixture name does not match any known fixture names.
+    """
+    if fixture_name not in NAME_TO_FIXTURE:
+        valid_fixture_names = ", ".join(NAME_TO_FIXTURE.keys())
+        raise ValueError(f'"{fixture_name}" is invalid. Valid names are: {valid_fixture_names}')
+    return NAME_TO_FIXTURE[fixture_name]
+
+
+@dataclass
+class ExampleInferences:
+    """
+    A primary dataset with optional reference and corpus datasets.
+    """
+
+    primary: Inferences
+    reference: Optional[Inferences] = None
+    corpus: Optional[Inferences] = None
+
+
+def load_example(use_case: str) -> ExampleInferences:
+    """
+    Loads example primary and reference datasets for a given use-case.
+
+    Parameters
+    ----------
+    use_case: str
+        Name of the Phoenix-supported use case. Valid values include:
+            - "sentiment_classification_language_drift"
+            - "image_classification"
+            - "fashion_mnist"
+            - "ner_token_drift"
+            - "credit_card_fraud"
+            - "click_through_rate"
+
+
+    Returns
+    -------
+    inferences: ExampleInferences
+        A dataclass holding the primary dataset and, when the fixture provides
+        them, the reference and corpus datasets.
+
+    """
+    fixture = _get_fixture_by_name(use_case)
+    primary_dataset, reference_dataset, corpus_dataset = get_datasets(use_case)
+    print(f"📥 Loaded {use_case} example datasets.")
+    print("ℹ️ About this use-case:")
+    print(fixture.description)
+    return ExampleInferences(
+        primary=primary_dataset,
+        reference=reference_dataset,
+        corpus=corpus_dataset,
+    )
+
+
+class Metadata(NamedTuple):
+    path: str
+    mediaLink: str
+    md5Hash: str
+
+    def save_artifact(self, location: Path) -> Path:
+        data_file_path = location / self.path
+        md5_file = data_file_path.with_name(data_file_path.stem + ".md5")
+        data_file_path.parents[0].mkdir(parents=True, exist_ok=True)
+        if data_file_path.is_file() and md5_file.is_file():
+            with open(md5_file, "r") as f:
+                if f.readline() == self.md5Hash:
+                    return data_file_path
+        request.urlretrieve(self.mediaLink, data_file_path)
+        with open(md5_file, "w") as f:
+            f.write(self.md5Hash)
+        return data_file_path
+
+
+class GCSAssets(NamedTuple):
+    host: str = "https://storage.googleapis.com/"
+    bucket: str = "arize-assets"
+    prefix: str = "phoenix/datasets/"
+
+    def metadata(self, path: Path) -> Metadata:
+        url = urljoin(
+            urljoin(self.host, f"storage/v1/b/{self.bucket}/o/"),
+            quote(urljoin(self.prefix, str(path)), safe=""),
+        )
+        resp = json.loads(request.urlopen(request.Request(url)).read())
+        return Metadata(
+            resp["name"][len(self.prefix) :],
+            resp["mediaLink"],
+            resp["md5Hash"],
+        )
+
+
+def _download(fixture: Fixture, location: Path) -> Iterator[Tuple[DatasetRole, Path]]:
+    for role, path in fixture.paths():
+        yield role, GCSAssets().metadata(path).save_artifact(location)
+
+
+# Download all fixtures
+if __name__ == "__main__":
+    import time
+
+    for fixture in FIXTURES:
+        start_time = time.time()
+        print(f"getting {fixture.name}", end="...")
+        dict(_download(fixture, DATASET_DIR))
+        print(f"done ({time.time() - start_time:.2f}s)")
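For orientation, the two public entry points in the new module are load_example and get_datasets. The sketch below shows how they might be exercised; the import path phoenix.inferences.fixtures is an assumption (the diff does not show the destination file name), and the fixture names come straight from the FIXTURES tuple above.

# Minimal usage sketch for the new fixtures module. The module path below is
# an assumption -- the diff does not show where the file lands in the package.
from phoenix.inferences.fixtures import get_datasets, load_example

# load_example downloads (or reuses cached copies of) the fixture's parquet
# files and wraps them in Inferences objects.
example = load_example("chatbot")
print(example.primary)              # built with chatbot_queries_schema
print(example.reference is None)    # True: the chatbot fixture has no reference file
print(example.corpus is not None)   # True: the chatbot fixture ships a corpus file

# get_datasets returns the (primary, reference, corpus) tuple directly;
# no_internet=True skips the GCS download and reads from DATASET_DIR instead.
primary, reference, corpus = get_datasets("credit_card_fraud", no_internet=False)

Per the Fixture definitions above, only the "wiki" and "chatbot" fixtures populate a corpus dataset; the others return None for it.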