stretchy-model 0.6.5 → 0.6.6
- checksums.yaml +4 -4
- data/.yardopts +2 -1
- data/README.md +28 -10
- data/Rakefile +56 -0
- data/docs/.nojekyll +0 -0
- data/docs/README.md +147 -0
- data/docs/_coverpage.md +14 -0
- data/docs/_sidebar.md +14 -0
- data/docs/examples/_sidebar.md +15 -0
- data/docs/examples/data_analysis.md +216 -0
- data/docs/examples/semantic_search_with_llm.md +83 -0
- data/docs/examples/simple-ingest-pipeline.md +326 -0
- data/docs/guides/_sidebar.md +14 -0
- data/docs/guides/aggregations.md +142 -0
- data/docs/guides/machine-learning.md +154 -0
- data/docs/guides/models.md +372 -0
- data/docs/guides/pipelines.md +151 -0
- data/docs/guides/querying.md +361 -0
- data/docs/guides/quick-start.md +72 -0
- data/docs/guides/scopes.md +125 -0
- data/docs/index.html +113 -0
- data/docs/stretchy.cover.png +0 -0
- data/docs/stretchy.logo.png +0 -0
- data/docs/styles.css +90 -0
- data/lib/stretchy/attributes/transformers/keyword_transformer.rb +41 -35
- data/lib/stretchy/attributes/type/array.rb +24 -1
- data/lib/stretchy/attributes/type/base.rb +6 -2
- data/lib/stretchy/attributes/type/binary.rb +24 -17
- data/lib/stretchy/attributes/type/boolean.rb +29 -22
- data/lib/stretchy/attributes/type/completion.rb +18 -10
- data/lib/stretchy/attributes/type/constant_keyword.rb +35 -26
- data/lib/stretchy/attributes/type/date_time.rb +28 -17
- data/lib/stretchy/attributes/type/dense_vector.rb +46 -49
- data/lib/stretchy/attributes/type/flattened.rb +28 -19
- data/lib/stretchy/attributes/type/geo_point.rb +21 -12
- data/lib/stretchy/attributes/type/geo_shape.rb +21 -12
- data/lib/stretchy/attributes/type/hash.rb +24 -10
- data/lib/stretchy/attributes/type/histogram.rb +25 -0
- data/lib/stretchy/attributes/type/ip.rb +26 -17
- data/lib/stretchy/attributes/type/join.rb +16 -7
- data/lib/stretchy/attributes/type/keyword.rb +21 -26
- data/lib/stretchy/attributes/type/knn_vector.rb +47 -0
- data/lib/stretchy/attributes/type/match_only_text.rb +22 -1
- data/lib/stretchy/attributes/type/nested.rb +16 -11
- data/lib/stretchy/attributes/type/numeric/base.rb +30 -22
- data/lib/stretchy/attributes/type/numeric/byte.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/double.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/float.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/half_float.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/integer.rb +21 -1
- data/lib/stretchy/attributes/type/numeric/long.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/scaled_float.rb +16 -7
- data/lib/stretchy/attributes/type/numeric/short.rb +20 -0
- data/lib/stretchy/attributes/type/numeric/unsigned_long.rb +21 -1
- data/lib/stretchy/attributes/type/percolator.rb +16 -4
- data/lib/stretchy/attributes/type/point.rb +19 -9
- data/lib/stretchy/attributes/type/range/base.rb +24 -1
- data/lib/stretchy/attributes/type/range/date_range.rb +21 -5
- data/lib/stretchy/attributes/type/range/double_range.rb +20 -4
- data/lib/stretchy/attributes/type/range/float_range.rb +21 -5
- data/lib/stretchy/attributes/type/range/integer_range.rb +20 -4
- data/lib/stretchy/attributes/type/range/ip_range.rb +20 -4
- data/lib/stretchy/attributes/type/range/long_range.rb +20 -4
- data/lib/stretchy/attributes/type/rank_feature.rb +16 -6
- data/lib/stretchy/attributes/type/rank_features.rb +16 -9
- data/lib/stretchy/attributes/type/search_as_you_type.rb +28 -18
- data/lib/stretchy/attributes/type/shape.rb +19 -9
- data/lib/stretchy/attributes/type/sparse_vector.rb +25 -21
- data/lib/stretchy/attributes/type/string.rb +42 -1
- data/lib/stretchy/attributes/type/text.rb +53 -28
- data/lib/stretchy/attributes/type/token_count.rb +21 -11
- data/lib/stretchy/attributes/type/version.rb +16 -6
- data/lib/stretchy/attributes/type/wildcard.rb +36 -25
- data/lib/stretchy/attributes.rb +29 -0
- data/lib/stretchy/delegation/gateway_delegation.rb +78 -0
- data/lib/stretchy/index_setting.rb +94 -0
- data/lib/stretchy/indexing/bulk.rb +75 -3
- data/lib/stretchy/model/callbacks.rb +1 -0
- data/lib/stretchy/model/common.rb +157 -0
- data/lib/stretchy/model/persistence.rb +144 -0
- data/lib/stretchy/model/refreshable.rb +26 -0
- data/lib/stretchy/pipeline.rb +2 -1
- data/lib/stretchy/pipelines/processor.rb +38 -36
- data/lib/stretchy/querying.rb +7 -8
- data/lib/stretchy/record.rb +5 -4
- data/lib/stretchy/relation.rb +229 -28
- data/lib/stretchy/relations/aggregation_methods/aggregation.rb +59 -0
- data/lib/stretchy/relations/aggregation_methods/avg.rb +45 -0
- data/lib/stretchy/relations/aggregation_methods/bucket_script.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/bucket_selector.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/bucket_sort.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/cardinality.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/children.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/composite.rb +41 -0
- data/lib/stretchy/relations/aggregation_methods/date_histogram.rb +53 -0
- data/lib/stretchy/relations/aggregation_methods/date_range.rb +53 -0
- data/lib/stretchy/relations/aggregation_methods/extended_stats.rb +48 -0
- data/lib/stretchy/relations/aggregation_methods/filter.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/filters.rb +47 -0
- data/lib/stretchy/relations/aggregation_methods/geo_bounds.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/geo_centroid.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/global.rb +39 -0
- data/lib/stretchy/relations/aggregation_methods/histogram.rb +43 -0
- data/lib/stretchy/relations/aggregation_methods/ip_range.rb +41 -0
- data/lib/stretchy/relations/aggregation_methods/max.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/min.rb +41 -0
- data/lib/stretchy/relations/aggregation_methods/missing.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/nested.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/percentile_ranks.rb +45 -0
- data/lib/stretchy/relations/aggregation_methods/percentiles.rb +45 -0
- data/lib/stretchy/relations/aggregation_methods/range.rb +42 -0
- data/lib/stretchy/relations/aggregation_methods/reverse_nested.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/sampler.rb +40 -0
- data/lib/stretchy/relations/aggregation_methods/scripted_metric.rb +43 -0
- data/lib/stretchy/relations/aggregation_methods/significant_terms.rb +45 -0
- data/lib/stretchy/relations/aggregation_methods/stats.rb +42 -0
- data/lib/stretchy/relations/aggregation_methods/sum.rb +42 -0
- data/lib/stretchy/relations/aggregation_methods/terms.rb +46 -0
- data/lib/stretchy/relations/aggregation_methods/top_hits.rb +42 -0
- data/lib/stretchy/relations/aggregation_methods/top_metrics.rb +44 -0
- data/lib/stretchy/relations/aggregation_methods/value_count.rb +41 -0
- data/lib/stretchy/relations/aggregation_methods/weighted_avg.rb +42 -0
- data/lib/stretchy/relations/aggregation_methods.rb +20 -749
- data/lib/stretchy/relations/finder_methods.rb +2 -18
- data/lib/stretchy/relations/null_relation.rb +55 -0
- data/lib/stretchy/relations/query_builder.rb +82 -36
- data/lib/stretchy/relations/query_methods/bind.rb +19 -0
- data/lib/stretchy/relations/query_methods/extending.rb +29 -0
- data/lib/stretchy/relations/query_methods/fields.rb +70 -0
- data/lib/stretchy/relations/query_methods/filter_query.rb +53 -0
- data/lib/stretchy/relations/query_methods/has_field.rb +40 -0
- data/lib/stretchy/relations/query_methods/highlight.rb +75 -0
- data/lib/stretchy/relations/query_methods/hybrid.rb +60 -0
- data/lib/stretchy/relations/query_methods/ids.rb +40 -0
- data/lib/stretchy/relations/query_methods/match.rb +52 -0
- data/lib/stretchy/relations/query_methods/must_not.rb +54 -0
- data/lib/stretchy/relations/query_methods/neural.rb +58 -0
- data/lib/stretchy/relations/query_methods/neural_sparse.rb +43 -0
- data/lib/stretchy/relations/query_methods/none.rb +21 -0
- data/lib/stretchy/relations/query_methods/or_filter.rb +21 -0
- data/lib/stretchy/relations/query_methods/order.rb +63 -0
- data/lib/stretchy/relations/query_methods/query_string.rb +44 -0
- data/lib/stretchy/relations/query_methods/regexp.rb +61 -0
- data/lib/stretchy/relations/query_methods/should.rb +51 -0
- data/lib/stretchy/relations/query_methods/size.rb +44 -0
- data/lib/stretchy/relations/query_methods/skip_callbacks.rb +47 -0
- data/lib/stretchy/relations/query_methods/source.rb +59 -0
- data/lib/stretchy/relations/query_methods/where.rb +113 -0
- data/lib/stretchy/relations/query_methods.rb +48 -569
- data/lib/stretchy/relations/scoping/default.rb +136 -0
- data/lib/stretchy/relations/scoping/named.rb +70 -0
- data/lib/stretchy/relations/scoping/scope_registry.rb +36 -0
- data/lib/stretchy/relations/scoping.rb +30 -0
- data/lib/stretchy/relations/search_option_methods.rb +2 -0
- data/lib/stretchy/version.rb +1 -1
- data/lib/stretchy.rb +17 -10
- metadata +111 -17
- data/lib/stretchy/common.rb +0 -38
- data/lib/stretchy/null_relation.rb +0 -53
- data/lib/stretchy/persistence.rb +0 -43
- data/lib/stretchy/refreshable.rb +0 -15
- data/lib/stretchy/scoping/default.rb +0 -134
- data/lib/stretchy/scoping/named.rb +0 -68
- data/lib/stretchy/scoping/scope_registry.rb +0 -34
- data/lib/stretchy/scoping.rb +0 -28
@@ -0,0 +1,326 @@ data/docs/examples/simple-ingest-pipeline.md
# Simple Ingest Pipeline

>[!INFO|style:flat|label:Prerequisites]
>
>- OpenSearch 2.12+ installed and running
>- Ruby on Rails
>
>Follow the [Quick Start](guides/quick-start) for detailed steps.

## Data Source

Our data source is JSON data representing vitals and patient information scraped from the dark web (jk). Each record has an id, vitals as a CSV string, a full name, an age, and an SSN wrapped in HTML tags. What a mess!

| id | vitals | name | age | ssn |
| ------- | ---------- | -------------------------- | --- | ------------------ |
| ta0j288 | 700,120,72 | Gov. Candy Williams | 30 | <b>547-93-4227</b> |
| ta0j288 | 56,120,72 | Romana Prohaska | 30 | <b>547-93-4227</b> |
| pnunl70 | 114,136,43 | Tristan Waelchi | 81 | <b>323-23-5997</b> |
| 8lhscax | 105,66,56 | Antoine Hauck | 46 | <b>381-54-5352</b> |
| impcbo9 | 119,78,60 | Dewayne Stark | 39 | <b>816-86-6698</b> |
| jxr8h3v | 81,69,58 | Shelton Powlowski | 77 | <b>810-63-7478</b> |
| d7lwaln | 103,140,93 | Sage Medhurst | 19 | <b>470-43-3841</b> |
| ryrtjb5 | 57,118,86 | Tobias Strosin | 76 | <b>197-25-4397</b> |
| ox227l3 | 82,103,98 | Jessi Barton | 41 | <b>700-41-0042</b> |
| c7vyqu2 | 103,73,90 | Eliseo Feest | 53 | <b>153-01-6678</b> |
| i8lbviz | 81,120,91 | The Hon. Zandra Dibbert MD | 55 | <b>881-10-7835</b> |

Our goal is to create an ingest pipeline using `stretchy-model` to process this data. The pipeline will split the vitals CSV into separate fields, strip the HTML tags from the SSN, and split the full name into first and last names.

## Define the Pipeline

An ingest pipeline in Elasticsearch allows us to pre-process documents before the actual document indexing occurs. It's a way to transform and enrich the data, making it more useful and easier to work with.

```mermaid
flowchart LR

    CSV --> SCRIPT
    SCRIPT --> HTML_STRIP
    HTML_STRIP --> CONVERT
    CONVERT --> REMOVE
    REMOVE --> INDEX

```

By doing these transformations as part of the ingest process, we ensure the data is in the right format and structure from the moment it enters Elasticsearch, which makes subsequent data handling and analysis much easier and more efficient.

_app/pipelines/intake_form_pipeline.rb_

```ruby
class IntakeFormPipeline < Stretchy::Pipeline

  description "Ingests intake forms and scrubs ssn of html"

  processor :csv,
    field: :vitals,
    target_fields: [:heart_rate, :systolic, :diastolic],
    trim: true

  processor :script,
    description: "Extracts first and last name from name field",
    lang: "painless",
    source: <<~PAINLESS
      ctx['name'] = /^[\\w\\s]+\\.\\s/.matcher(ctx['name']).replaceAll("");
      String[] parts = /\\s+/.split(ctx['name']);
      ctx['first_name'] = parts[0];
      if (parts.length > 1) {
        ctx['last_name'] = parts[1];
      }
    PAINLESS

  processor :html_strip, field: :ssn
  processor :convert, field: :systolic, type: :integer
  processor :convert, field: :diastolic, type: :integer
  processor :convert, field: :heart_rate, type: :integer

  processor :remove, field: :name
  processor :remove, field: :vitals

end
```

The `IntakeFormPipeline` will preprocess documents that are sent to be indexed. We have a `description` and a series of `processor` statements, each performing a specific transformation on the data:

- **csv** - parse `vitals` and map them to the `heart_rate`, `systolic` and `diastolic` fields
- **script** - split `name` into `first_name` and `last_name`, removing any titles like Dr., Rev. etc.
- **html_strip** - scrub `ssn` of any HTML tags
- **convert** - ensure vitals are all integers
- **remove** - remove the fields we no longer need
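
For a concrete picture, here's the first record before and after the pipeline runs; the values are taken from the simulated output shown later in this guide:

```ruby
# Before: the raw intake form as scraped.
{"id" => "ta0j288", "vitals" => "700,120,72", "name" => "Gov. Candy Williams",
 "age" => 30, "ssn" => "<b>547-93-4227</b>"}

# After: vitals split into integer fields, the title stripped from the name,
# the SSN scrubbed of HTML, and the name/vitals fields removed.
{"id" => "ta0j288", "first_name" => "Candy", "last_name" => "Williams",
 "age" => 30, "ssn" => "547-93-4227",
 "heart_rate" => 700, "systolic" => 120, "diastolic" => 72}
```
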
**Create the pipeline:**

This command sends a request to the Elasticsearch server to create a new ingest pipeline with the specifications defined in the `IntakeFormPipeline` class.

Once the pipeline is created, it's ready to preprocess any documents that are sent to be indexed. The transformations defined in the pipeline (such as parsing CSVs, splitting names, stripping HTML tags, and converting fields to integers) will be applied to each document before it's indexed.

```ruby
IntakeFormPipeline.create!
```

**Response:**
```ruby
#=> {"acknowledged"=>true}
```

Remember, the pipeline only needs to be created once. After it's created, it will be used automatically whenever documents are indexed in Elasticsearch. If you need to change the pipeline, you can remove it with the `IntakeFormPipeline.delete!` command and recreate it.
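
A quick sketch of that update cycle, using only the calls shown in this guide:

```ruby
# Drop the existing pipeline, then recreate it from the edited class.
IntakeFormPipeline.delete!
IntakeFormPipeline.create!
```
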
## Describe the Model

The `IntakeForm` model represents the index where we'll store our intake forms.

*app/models/intake_form.rb*

```ruby
class IntakeForm < StretchyModel
  attribute :first_name, :keyword
  attribute :last_name,  :keyword
  attribute :ssn,        :keyword
  attribute :age,        :integer
  attribute :heart_rate, :integer
  attribute :systolic,   :integer
  attribute :diastolic,  :integer

  default_pipeline :intake_form_pipeline
end
```

The `IntakeForm` model inherits from `StretchyModel`, which gives it all of the functionality `StretchyModel` provides.

The `attribute` method defines the fields of the `IntakeForm` model. Each `attribute` has a name and a type; the type corresponds to the Elasticsearch field type.

The `default_pipeline` method sets the default ingest pipeline for the model. Here it's set to `:intake_form_pipeline`, so the `intake_form_pipeline` will preprocess documents before they are indexed.

**Create the index:**

```ruby
IntakeForm.create_index!
```

**Response:**
```ruby
#=> {"acknowledged"=>true, "shards_acknowledged"=>true, "index"=>"intake_forms"}
```

## Run the pipeline

To run the pipeline, you'll need to index documents using the `IntakeForm` model. The `default_pipeline` you set earlier will automatically preprocess the documents before they are indexed.

```ruby
initial_data = [
  {"id": "ta0j288", "vitals": "700,120,72", "name": "Gov. Candy Williams", "age": 30, "ssn": "<b>547-93-4227</b>"},
  {"id": "ta0j288", "vitals": "56,120,72", "name": "Romana Prohaska", "age": 30, "ssn": "<b>547-93-4227</b>"},
  {"id": "pnunl70", "vitals": "114,136,43", "name": "Tristan Waelchi", "age": 81, "ssn": "<b>323-23-5997</b>"},
  {"id": "8lhscax", "vitals": "105,66,56", "name": "Antoine Hauck", "age": 46, "ssn": "<b>381-54-5352</b>"},
  {"id": "impcbo9", "vitals": "119,78,60", "name": "Dewayne Stark", "age": 39, "ssn": "<b>816-86-6698</b>"},
  {"id": "jxr8h3v", "vitals": "81,69,58", "name": "Shelton Powlowski", "age": 77, "ssn": "<b>810-63-7478</b>"},
  {"id": "d7lwaln", "vitals": "103,140,93", "name": "Sage Medhurst", "age": 19, "ssn": "<b>470-43-3841</b>"},
  {"id": "ryrtjb5", "vitals": "57,118,86", "name": "Tobias Strosin", "age": 76, "ssn": "<b>197-25-4397</b>"},
  {"id": "ox227l3", "vitals": "82,103,98", "name": "Jessi Barton", "age": 41, "ssn": "<b>700-41-0042</b>"},
  {"id": "c7vyqu2", "vitals": "103,73,90", "name": "Eliseo Feest", "age": 53, "ssn": "<b>153-01-6678</b>"},
  {"id": "i8lbviz", "vitals": "81,120,91", "name": "The Hon. Zandra Dibbert MD", "age": 55, "ssn": "<b>881-10-7835</b>"}
]
```

#### Simulate

We can simulate `IntakeFormPipeline` to make sure it works as expected.

```ruby
docs = initial_data.map { |doc| { _source: doc } }
```

We prepare the initial data by making sure each entry has a `_source` field with the document as the value. This is slightly different from how we'll prepare the data for actual indexing.

**Simulate the pipeline:**

```ruby
IntakeFormPipeline.simulate(docs)
```

**Response:**
```ruby
#=> {"docs"=>
#     [{"processor_results"=>
#        [{"processor_type"=>"csv",
#          "status"=>"success",
#          "doc"=>
#           {"_index"=>"_index",
#            "_type"=>"_doc",
#            "_id"=>"_id",
#            "_source"=>{"systolic"=>"120", "diastolic"=>"72", "name"=>"Gov. Candy Williams", "heart_rate"=>"700", "id"=>"ta0j288", "vitals"=>"700,120,72", "age"=>30, "ssn"=>"<b>547-93-4227</b>"},
#            "_ingest"=>{"pipeline"=>"intake_form_pipeline", "timestamp"=>"2024-03-20T13:06:17.661745464Z"}}},
#         {"processor_type"=>"script",
#          "status"=>"success",
#          "description"=>"Extracts first and last name from name field",
#          "doc"=>
#           {"_index"=>"_index",
#            "_type"=>"_doc",
#            "_id"=>"_id",
#            "_source"=>{"heart_rate"=>"700", "last_name"=>"Williams", "ssn"=>"<b>547-93-4227</b>", "systolic"=>"120", "diastolic"=>"72", "name"=>"Candy Williams", "id"=>"ta0j288", "first_name"=>"Candy", "vitals"=>"700,120,72", "age"=>30},
#            "_ingest"=>{"pipeline"=>"intake_form_pipeline", "timestamp"=>"2024-03-20T13:06:17.661745464Z"}}},
#      ...
```

The response shows the results of the simulation, with each processor step and its status.
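
Since `simulate` returns a plain hash, a quick sanity check over the output is easy to write. A minimal sketch, assuming the response shape shown above:

```ruby
results = IntakeFormPipeline.simulate(docs)

# Collect any processor result that didn't report success.
failures = results["docs"]
  .flat_map { |doc| doc["processor_results"] }
  .reject { |result| result["status"] == "success" }

puts failures.empty? ? "all processors succeeded" : failures.inspect
```
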
#### Ingest

Now, let's ingest the data into the index. We'll use a bulk request to index our documents:

```ruby
bulk_records = initial_data.map do |data|
  { index: { _index: IntakeForm.index_name, data: data } }
end

IntakeForm.bulk(bulk_records)
```

**Response:**
```ruby
=>
{"took"=>3,
 "ingest_took"=>1,
 "errors"=>false,
 "items"=>
  [{"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"vzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>0, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>1, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>2, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wjz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>3, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>4, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>5, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>6, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xjz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>7, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>8, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"yDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>9, "_primary_term"=>1, "status"=>201}},
   {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"yTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>10, "_primary_term"=>1, "status"=>201}}]}
```
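
The `"errors"` flag in the response is `false` when every item indexed cleanly. A small sketch for guarding a bulk load, assuming the response shape above:

```ruby
response = IntakeForm.bulk(bulk_records)

# Surface any item that didn't come back with a 2xx status.
if response["errors"]
  failed = response["items"].select { |item| item["index"]["status"] >= 300 }
  raise "bulk indexing failed: #{failed.inspect}"
end
```
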

Our ingest pipeline will perform all of the operations we defined as `processors` in the `IntakeFormPipeline` and index the resulting documents.

Let's see how it did:

```ruby
IntakeForm.count
#=> 11

IntakeForm.first.heart_rate
#=> 700
```

Wow! The Gov. must be having as much fun as us with a heart rate like that.

Let's get the average heart rate per age group:

```ruby
results = IntakeForm.range(:ages, {
    field: :age,
    ranges: [
      {from: 19, to: 39},
      {from: 40, to: 59},
      {from: 60, to: 79},
      {from: 80}
    ],
    keyed: true
  },
  aggs: {avg_heart_rate: {avg: {field: :heart_rate}}}).size(0)

ap results.aggregations.ages.buckets # pretty-print via the awesome_print gem
```

**Response:**
```ruby
{
  "19.0-39.0" => {
    "from" => 19.0,
    "to" => 39.0,
    "doc_count" => 3,
    "avg_heart_rate" => {
      "value" => 286.3333333333333
    }
  },
  "40.0-59.0" => {
    "from" => 40.0,
    "to" => 59.0,
    "doc_count" => 4,
    "avg_heart_rate" => {
      "value" => 92.75
    }
  },
  "60.0-79.0" => {
    "from" => 60.0,
    "to" => 79.0,
    "doc_count" => 2,
    "avg_heart_rate" => {
      "value" => 69.0
    }
  },
  "80.0-*" => {
    "from" => 80.0,
    "doc_count" => 1,
    "avg_heart_rate" => {
      "value" => 114.0
    }
  }
}
```
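
Because we passed `keyed: true`, the buckets come back as a hash keyed by range, so an individual value can be pulled out directly (dot notation works here too, per the aggregations guide):

```ruby
# Average heart rate for the 19-39 bucket; the key format comes from the response above.
results.aggregations.ages.buckets["19.0-39.0"]["avg_heart_rate"]["value"]
#=> 286.3333333333333
```
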
In this guide, we've walked through the process of creating an ingest pipeline with Elasticsearch using `stretchy-model`.

We started with a dataset of patient information, which included fields that needed preprocessing before indexing. We defined an ingest pipeline, `IntakeFormPipeline`, that transformed the data into a more useful format, including parsing CSVs, splitting names, removing HTML tags, and converting fields to integers.

We then used the `IntakeForm` model, which inherits from `StretchyModel`, to index the preprocessed data in Elasticsearch. We also demonstrated how to run aggregations on the indexed data to get insights, such as the average heart rate per age group.

This is a simple example, but ingest pipelines can be much more complex and powerful, allowing you to preprocess your data in many different ways before indexing. With `stretchy-model`, you can leverage the full power of Elasticsearch's ingest pipelines while writing Ruby code that feels familiar and idiomatic.

## Cleaning up

```ruby
IntakeForm.delete_index!
IntakeFormPipeline.delete!
```

@@ -0,0 +1,14 @@
* [__Readme__](/)

* __Guides__
  * [Quick Start](guides/quick-start?id=quick-start)
  * [Models](guides/models?id=models)
  * [Querying](guides/querying?id=querying)
  * [Scopes](guides/scopes?id=scopes)
  * [Aggregations](guides/aggregations?id=aggregations)
  * [Pipelines](guides/pipelines?id=pipelines)
  * [Machine Learning](guides/machine-learning?id=machine-learning)

* __Examples__
  * [Data Analysis](examples/data_analysis)
  * [Simple Ingest Pipeline](examples/simple-ingest-pipeline)
@@ -0,0 +1,142 @@ data/docs/guides/aggregations.md
# Aggregations

Aggregations in Elasticsearch allow you to get summary information about your data. For example, you can use aggregations to count the number of records that match certain criteria, calculate the average value of a field, find the minimum or maximum value, and more.

When performing aggregations, it's good practice to set `size(0)` if you don't need the source documents.

```ruby
results = Profile.aggregation(:flagged_counts, terms: {field: :flagged}).size(0)
```

Aggregation results are available on the result's `aggregations` object under the name given to the aggregation:

```ruby
results.aggregations.flagged_counts
```

returns:
```ruby
{
  "doc_count_error_upper_bound"=>0,
  "sum_other_doc_count"=>0,
  "buckets"=>[
    {"key"=>"true", "doc_count"=>123},
    {"key"=>"false", "doc_count"=>456}
  ]
}
```

>[!TIP|label:Accessing Aggregation Results]
>You can access the entire structure through dot notation.
>
>`aggregations.flagged_counts.buckets.first.doc_count` => `123`

---

In Stretchy, you use the `aggregation` method to define aggregations. Here are some examples:

### Count by status

If you have a `status` field and you want to count how many records there are for each status, you can use a terms aggregation:
```ruby
Profile.aggregation(:status_count, terms: { field: :status })
```

### Average Age

If you have an age field and you want to calculate the average age, you can do this:

```ruby
Profile.aggregation(:average_age, avg: { field: :age })
```

### Minimum and Maximum Age

If you want to find the minimum and maximum age, you can do this:

```ruby
Profile.aggregation(:min_age, min: { field: :age })
Profile.aggregation(:max_age, max: { field: :age })
```

### Date Histogram

If you have a `created_at` field and you want to count how many profiles were created in each month, you can do this:
```ruby
Profile.aggregation(:profiles_over_time, date_histogram: { field: :created_at, interval: 'month' })
```

In these examples, the first argument to the aggregation method is the name of the aggregation, and the second argument is a hash that defines the aggregation. The key of the hash is the type of the aggregation (terms, avg, min, max, or date_histogram), and the value is another hash that specifies the field to aggregate on and other options.
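
Putting the pieces together, a short sketch that runs one of these aggregations with `size(0)` and reads the buckets back through the dot notation covered above:

```ruby
results = Profile.aggregation(:status_count, terms: { field: :status }).size(0)

# Each terms bucket carries a key and a doc_count.
results.aggregations.status_count.buckets.each do |bucket|
  puts "#{bucket.key}: #{bucket.doc_count}"
end
```
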
## Named Aggregation Helpers

The above shows how to use the `aggregation` method directly, but Stretchy makes working with named aggregations even easier. Named aggregation helpers make calling the aggregation you want a breeze.

The documentation goes into depth for all available [aggregation types](/doc/stretchy/relations/AggregationMethods).

### Percentiles

The `percentiles` aggregation method calculates the percentiles of a numeric field. For example, if you want to calculate the 25th, 50th, and 75th percentiles of the age field, you can do this:
```ruby
Profile.percentiles(:age_percentiles, field: :age, percents: [25, 50, 75])
```

### Extended Stats

The `extended_stats` aggregation method calculates several statistical measures of a numeric field, including the count, min, max, sum, average, sum of squares, variance, standard deviation, and bounds. For example, if you want to calculate these measures for the age field, you can do this:

```ruby
Profile.extended_stats(:age_stats, field: :age)
```

### Date Range

The `date_range` aggregation method groups documents by whether their date field falls within specified ranges. For example, if you want to count how many profiles were created before and after a certain date, you can do this:
```ruby
Profile.date_range(:created_at_range, field: :created_at, ranges: [{ to: '2022-01-01' }, { from: '2022-01-01' }])
```

### Significant Terms

The `significant_terms` aggregation method finds the terms that appear more often in the documents that match your query than in the documents that don't. For example, if you want to find the tags that are significantly associated with profiles that have a status of "active", you can do this:
```ruby
Profile.where(status: 'active').significant_terms(:significant_tags, field: :tags)
```

In these examples, the first argument to the aggregation method is the name of the aggregation, and the second argument is a hash that specifies the field to aggregate on and other options. The exact options depend on the aggregation method.

## Nested Aggregations

Elasticsearch supports complex aggregations by allowing you to nest sub-aggregations within top-level aggregations. These sub-aggregations operate within the context of the parent aggregation, allowing you to refine and group your data in various ways.

There are three main types of aggregations in Elasticsearch: bucket, metric, and pipeline aggregations.

#### Bucket Aggregations
These aggregations create buckets or sets of documents based on certain criteria. Examples include `terms`, `date_histogram`, `range`, and `significant_terms` aggregations. Each bucket effectively defines a document set, and any sub-aggregations operate within the context of that set.

For example, you could use a terms aggregation to group documents by the status field, and then use a sub-aggregation to calculate the average age within each status group:
```ruby
Profile.aggregation(:status_avg_age, terms: { field: :status }, aggs: { avg_age: { avg: { field: :age } } })
```

#### Metric Aggregations
These aggregations perform calculations on the documents in each bucket, producing a single numeric result. Examples include `avg`, `sum`, `min`, `max`, and `extended_stats`.

For example, you could use a terms aggregation to group documents by the status field, and then use a max sub-aggregation to find the maximum age within each status group:
```ruby
Profile.aggregation(:status_max_age, terms: { field: :status }, aggs: { max_age: { max: { field: :age } } })
```

#### Pipeline Aggregations
These aggregations perform calculations on the results of other aggregations, allowing you to create complex summaries of your data. Examples include `avg_bucket`, `sum_bucket`, `min_bucket`, `max_bucket`, and `stats_bucket`.

For example, you could use a date_histogram aggregation to count documents by month, and then use a sum_bucket sub-aggregation to calculate the total count over all months:
```ruby
Profile.aggregation(:total_count_over_time, date_histogram: { field: :created_at, interval: 'month' }, aggs: { total_count: { sum_bucket: { buckets_path: '_count' } } })
# or
Profile.date_histogram(:total_count_over_time, {field: :created_at, interval: :month}, aggs: {total_count: { sum_bucket: { buckets_path: '_count' } } })
```

In these examples, the `aggs` option is used to define sub-aggregations. The key is the name of the sub-aggregation, and the value is a hash that defines the sub-aggregation.
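
To read a sub-aggregation back out, note that each parent bucket carries its sub-aggregation results alongside `key` and `doc_count`. A sketch along the lines of the earlier examples:

```ruby
results = Profile.aggregation(:status_avg_age,
  terms: { field: :status },
  aggs: { avg_age: { avg: { field: :age } } }).size(0)

# Each status bucket includes its avg_age sub-aggregation.
results.aggregations.status_avg_age.buckets.each do |bucket|
  puts "#{bucket.key}: #{bucket.avg_age.value}"
end
```
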
@@ -0,0 +1,154 @@ data/docs/guides/machine-learning.md
# Machine Learning

>[!NOTE|style:flat|label:OpenSearch Compatibility]
> OpenSearch and Elasticsearch diverge in how they handle machine learning APIs. These features are in active development and subject to change.
>
> This guide largely covers OpenSearch Machine Learning unless otherwise stated.

>[!WARNING|label:Machine Learning on Elasticsearch]
> Elasticsearch requires a license to enable ML capabilities.

## Models
Machine Learning models follow a specific convention for storing model definitions. This helps us keep our code organized and easy to navigate.

- *app/machine_learning/models/example_machine_learning_model.rb*

A `MachineLearningModel` consists of the following components:

```ruby
class SparseEncodingModel < Stretchy::MachineLearning::Model
  model :neural_sparse_encoding,
    version: '1.0.1',
    model_format: 'TORCH_SCRIPT',
    description: 'Creates sparse embedding for onboarding docs'
end
```

- `model:` This is the name of the model. It should match one of the pre-trained models available in your application. In this case, it's `:neural_sparse_encoding`.

- `version:` This is the version of the model. It's important to specify this, as different versions of the same model may have different behaviors or requirements.

- `model_format:` This is the format of the model. It tells Stretchy how to interpret the model file. In this case, it's 'TORCH_SCRIPT', which means the model is a TorchScript file. TorchScript is a way to serialize PyTorch models.

- `description:` This is a brief description of what the model does. It's good practice to provide a meaningful description so that others can understand the purpose of your model at a glance. In this case, the description is 'Creates sparse embedding for onboarding docs'.

## Managing Models

>[!TIP|label:Machine Learning Settings]
> When running development or single-node clusters, you may need to adjust your cluster settings to allow Machine Learning models to run on all nodes instead of dedicated machine learning nodes.
> Add `Stretchy::MachineLearning::Model.ml_on_all_nodes!` to your *config/environments/development.rb* file to enable machine learning on all nodes.
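
A sketch of that development configuration; the `configure` block is standard Rails, and `ml_on_all_nodes!` is the setting named in the tip:

```ruby
# config/environments/development.rb
Rails.application.configure do
  # Allow ML models to run on every node in a dev/single-node cluster.
  Stretchy::MachineLearning::Model.ml_on_all_nodes!
end
```
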
### register
Registers the machine learning model.

```ruby
MyMachineLearningModel.register
```
The `register` operation is asynchronous and can take some time to complete. To wait until the operation is complete, use the helper method `wait_until_complete` in combination with the `registered?` method:
```ruby
MyMachineLearningModel.register do |model|
  model.wait_until_complete do
    model.registered?
  end
end
```

### registered?
Checks the model status and returns true if `model_id` is present and `state` is `COMPLETED`.

```ruby
MyMachineLearningModel.registered?
```

### status
Returns the status of the model registration.

```ruby
MyMachineLearningModel.status
```

### deploy
Deploys the model, making it available for use. Requires the model to be registered.

```ruby
MyMachineLearningModel.deploy
```

The `deploy` operation is asynchronous and can take some time to complete. Use the `wait_until_complete` method in combination with `deployed?` to wait until the model is deployed.

```ruby
MyMachineLearningModel.deploy do |model|
  model.wait_until_complete(sleep_time: 5) do
    model.deployed?
  end
end
```

### undeploy
Undeploys the model.

```ruby
MyMachineLearningModel.undeploy
```

### deployed?
Gets the model and checks if `model_state` is `DEPLOYED`.

```ruby
MyMachineLearningModel.deployed?
```

### delete
Deletes the model. The model must be undeployed before it can be deleted.

```ruby
MyMachineLearningModel.delete
```

### wait_until_complete
A helper that waits for async tasks to complete. Accepts `max_attempts` and `sleep_time`.

```ruby
MyMLModel.register do |model|
  model.wait_until_complete(max_attempts: 20, sleep_time: 4) do
    # finish waiting when the last statement in the block is true
    model.registered?
  end
end
```
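
Putting the lifecycle together, a sketch that registers, deploys, and eventually tears down the `SparseEncodingModel` defined earlier, using only the calls covered above:

```ruby
SparseEncodingModel.register do |model|
  model.wait_until_complete { model.registered? }
end

SparseEncodingModel.deploy do |model|
  model.wait_until_complete(sleep_time: 5) { model.deployed? }
end

# ... use the model ...

# Teardown: a model must be undeployed before it can be deleted.
SparseEncodingModel.undeploy
SparseEncodingModel.delete
```
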
### all
Returns all registered models.

```ruby
MyMLModel.all
```

## Pre-trained models
OpenSearch provides a variety of pre-trained models for different tasks:

### Neural Sparse Models
- `:neural_sparse_encoding` - 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v1'
- `:neural_sparse_encoding_doc` - 'amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1'
- `:neural_sparse_tokenizer` - 'amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1'

### Cross Encoder Models
- `:cross_encoder_minilm_6` - 'huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2'
- `:cross_encoder_minilm_12` - 'huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2'

### Sentence Transformer Models
- `:sentence_transformers_roberta_all` - 'huggingface/sentence-transformers/all-distilroberta-v1'
- `:sentence_transformers_msmarco` - 'huggingface/sentence-transformers/msmarco-distilroberta-base-v2'
- `:sentence_transformers_minilm_6` - 'huggingface/sentence-transformers/all-MiniLM-L6-v2'
- `:sentence_transformers_minilm_12` - 'huggingface/sentence-transformers/all-MiniLM-L12-v2'
- `:sentence_transformers_mpnet` - 'huggingface/sentence-transformers/all-mpnet-base-v2'
- `:sentence_transformers_multi_qa_minilm_6` - 'huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
- `:sentence_transformers_multi_qa_mpnet` - 'huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1'
- `:sentence_transformers_paraphrase_minilm_3` - 'huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2'
- `:sentence_transformers_paraphrase_multilingual_minilm_12` - 'huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
- `:sentence_transformers_paraphrase_mpnet` - 'huggingface/sentence-transformers/paraphrase-mpnet-base-v2'
- `:sentence_transformers_multilingual_distiluse_cased` - 'huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1'

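As a sketch, any of these keys can drop into a model declaration like the `SparseEncodingModel` shown earlier. The class name and description here are illustrative, and the `version:` should match the pre-trained release you intend to use:

```ruby
# app/machine_learning/models/reranker_model.rb (hypothetical)
class RerankerModel < Stretchy::MachineLearning::Model
  model :cross_encoder_minilm_6,
    model_format: 'TORCH_SCRIPT',
    description: 'Re-ranks search results with a cross-encoder'
end
```
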
## Custom Models

Refer to the OpenSearch documentation on [deploying custom local models](https://opensearch.org/docs/latest/ml-commons-plugin/custom-local-models/).
|