stretchy-model 0.6.5 → 0.6.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +2 -1
  3. data/README.md +28 -10
  4. data/Rakefile +56 -0
  5. data/docs/.nojekyll +0 -0
  6. data/docs/README.md +147 -0
  7. data/docs/_coverpage.md +14 -0
  8. data/docs/_sidebar.md +14 -0
  9. data/docs/examples/_sidebar.md +15 -0
  10. data/docs/examples/data_analysis.md +216 -0
  11. data/docs/examples/semantic_search_with_llm.md +83 -0
  12. data/docs/examples/simple-ingest-pipeline.md +326 -0
  13. data/docs/guides/_sidebar.md +14 -0
  14. data/docs/guides/aggregations.md +142 -0
  15. data/docs/guides/machine-learning.md +154 -0
  16. data/docs/guides/models.md +372 -0
  17. data/docs/guides/pipelines.md +151 -0
  18. data/docs/guides/querying.md +361 -0
  19. data/docs/guides/quick-start.md +72 -0
  20. data/docs/guides/scopes.md +125 -0
  21. data/docs/index.html +113 -0
  22. data/docs/stretchy.cover.png +0 -0
  23. data/docs/stretchy.logo.png +0 -0
  24. data/docs/styles.css +90 -0
  25. data/lib/stretchy/attributes/transformers/keyword_transformer.rb +41 -35
  26. data/lib/stretchy/attributes/type/array.rb +24 -1
  27. data/lib/stretchy/attributes/type/base.rb +6 -2
  28. data/lib/stretchy/attributes/type/binary.rb +24 -17
  29. data/lib/stretchy/attributes/type/boolean.rb +29 -22
  30. data/lib/stretchy/attributes/type/completion.rb +18 -10
  31. data/lib/stretchy/attributes/type/constant_keyword.rb +35 -26
  32. data/lib/stretchy/attributes/type/date_time.rb +28 -17
  33. data/lib/stretchy/attributes/type/dense_vector.rb +46 -49
  34. data/lib/stretchy/attributes/type/flattened.rb +28 -19
  35. data/lib/stretchy/attributes/type/geo_point.rb +21 -12
  36. data/lib/stretchy/attributes/type/geo_shape.rb +21 -12
  37. data/lib/stretchy/attributes/type/hash.rb +24 -10
  38. data/lib/stretchy/attributes/type/histogram.rb +25 -0
  39. data/lib/stretchy/attributes/type/ip.rb +26 -17
  40. data/lib/stretchy/attributes/type/join.rb +16 -7
  41. data/lib/stretchy/attributes/type/keyword.rb +21 -26
  42. data/lib/stretchy/attributes/type/knn_vector.rb +47 -0
  43. data/lib/stretchy/attributes/type/match_only_text.rb +22 -1
  44. data/lib/stretchy/attributes/type/nested.rb +16 -11
  45. data/lib/stretchy/attributes/type/numeric/base.rb +30 -22
  46. data/lib/stretchy/attributes/type/numeric/byte.rb +20 -0
  47. data/lib/stretchy/attributes/type/numeric/double.rb +20 -0
  48. data/lib/stretchy/attributes/type/numeric/float.rb +20 -0
  49. data/lib/stretchy/attributes/type/numeric/half_float.rb +20 -0
  50. data/lib/stretchy/attributes/type/numeric/integer.rb +21 -1
  51. data/lib/stretchy/attributes/type/numeric/long.rb +20 -0
  52. data/lib/stretchy/attributes/type/numeric/scaled_float.rb +16 -7
  53. data/lib/stretchy/attributes/type/numeric/short.rb +20 -0
  54. data/lib/stretchy/attributes/type/numeric/unsigned_long.rb +21 -1
  55. data/lib/stretchy/attributes/type/percolator.rb +16 -4
  56. data/lib/stretchy/attributes/type/point.rb +19 -9
  57. data/lib/stretchy/attributes/type/range/base.rb +24 -1
  58. data/lib/stretchy/attributes/type/range/date_range.rb +21 -5
  59. data/lib/stretchy/attributes/type/range/double_range.rb +20 -4
  60. data/lib/stretchy/attributes/type/range/float_range.rb +21 -5
  61. data/lib/stretchy/attributes/type/range/integer_range.rb +20 -4
  62. data/lib/stretchy/attributes/type/range/ip_range.rb +20 -4
  63. data/lib/stretchy/attributes/type/range/long_range.rb +20 -4
  64. data/lib/stretchy/attributes/type/rank_feature.rb +16 -6
  65. data/lib/stretchy/attributes/type/rank_features.rb +16 -9
  66. data/lib/stretchy/attributes/type/search_as_you_type.rb +28 -18
  67. data/lib/stretchy/attributes/type/shape.rb +19 -9
  68. data/lib/stretchy/attributes/type/sparse_vector.rb +25 -21
  69. data/lib/stretchy/attributes/type/string.rb +42 -1
  70. data/lib/stretchy/attributes/type/text.rb +53 -28
  71. data/lib/stretchy/attributes/type/token_count.rb +21 -11
  72. data/lib/stretchy/attributes/type/version.rb +16 -6
  73. data/lib/stretchy/attributes/type/wildcard.rb +36 -25
  74. data/lib/stretchy/attributes.rb +29 -0
  75. data/lib/stretchy/delegation/gateway_delegation.rb +78 -0
  76. data/lib/stretchy/index_setting.rb +94 -0
  77. data/lib/stretchy/indexing/bulk.rb +75 -3
  78. data/lib/stretchy/model/callbacks.rb +1 -0
  79. data/lib/stretchy/model/common.rb +157 -0
  80. data/lib/stretchy/model/persistence.rb +144 -0
  81. data/lib/stretchy/model/refreshable.rb +26 -0
  82. data/lib/stretchy/pipeline.rb +2 -1
  83. data/lib/stretchy/pipelines/processor.rb +38 -36
  84. data/lib/stretchy/querying.rb +7 -8
  85. data/lib/stretchy/record.rb +5 -4
  86. data/lib/stretchy/relation.rb +229 -28
  87. data/lib/stretchy/relations/aggregation_methods/aggregation.rb +59 -0
  88. data/lib/stretchy/relations/aggregation_methods/avg.rb +45 -0
  89. data/lib/stretchy/relations/aggregation_methods/bucket_script.rb +47 -0
  90. data/lib/stretchy/relations/aggregation_methods/bucket_selector.rb +47 -0
  91. data/lib/stretchy/relations/aggregation_methods/bucket_sort.rb +47 -0
  92. data/lib/stretchy/relations/aggregation_methods/cardinality.rb +47 -0
  93. data/lib/stretchy/relations/aggregation_methods/children.rb +47 -0
  94. data/lib/stretchy/relations/aggregation_methods/composite.rb +41 -0
  95. data/lib/stretchy/relations/aggregation_methods/date_histogram.rb +53 -0
  96. data/lib/stretchy/relations/aggregation_methods/date_range.rb +53 -0
  97. data/lib/stretchy/relations/aggregation_methods/extended_stats.rb +48 -0
  98. data/lib/stretchy/relations/aggregation_methods/filter.rb +47 -0
  99. data/lib/stretchy/relations/aggregation_methods/filters.rb +47 -0
  100. data/lib/stretchy/relations/aggregation_methods/geo_bounds.rb +40 -0
  101. data/lib/stretchy/relations/aggregation_methods/geo_centroid.rb +40 -0
  102. data/lib/stretchy/relations/aggregation_methods/global.rb +39 -0
  103. data/lib/stretchy/relations/aggregation_methods/histogram.rb +43 -0
  104. data/lib/stretchy/relations/aggregation_methods/ip_range.rb +41 -0
  105. data/lib/stretchy/relations/aggregation_methods/max.rb +40 -0
  106. data/lib/stretchy/relations/aggregation_methods/min.rb +41 -0
  107. data/lib/stretchy/relations/aggregation_methods/missing.rb +40 -0
  108. data/lib/stretchy/relations/aggregation_methods/nested.rb +40 -0
  109. data/lib/stretchy/relations/aggregation_methods/percentile_ranks.rb +45 -0
  110. data/lib/stretchy/relations/aggregation_methods/percentiles.rb +45 -0
  111. data/lib/stretchy/relations/aggregation_methods/range.rb +42 -0
  112. data/lib/stretchy/relations/aggregation_methods/reverse_nested.rb +40 -0
  113. data/lib/stretchy/relations/aggregation_methods/sampler.rb +40 -0
  114. data/lib/stretchy/relations/aggregation_methods/scripted_metric.rb +43 -0
  115. data/lib/stretchy/relations/aggregation_methods/significant_terms.rb +45 -0
  116. data/lib/stretchy/relations/aggregation_methods/stats.rb +42 -0
  117. data/lib/stretchy/relations/aggregation_methods/sum.rb +42 -0
  118. data/lib/stretchy/relations/aggregation_methods/terms.rb +46 -0
  119. data/lib/stretchy/relations/aggregation_methods/top_hits.rb +42 -0
  120. data/lib/stretchy/relations/aggregation_methods/top_metrics.rb +44 -0
  121. data/lib/stretchy/relations/aggregation_methods/value_count.rb +41 -0
  122. data/lib/stretchy/relations/aggregation_methods/weighted_avg.rb +42 -0
  123. data/lib/stretchy/relations/aggregation_methods.rb +20 -749
  124. data/lib/stretchy/relations/finder_methods.rb +2 -18
  125. data/lib/stretchy/relations/null_relation.rb +55 -0
  126. data/lib/stretchy/relations/query_builder.rb +82 -36
  127. data/lib/stretchy/relations/query_methods/bind.rb +19 -0
  128. data/lib/stretchy/relations/query_methods/extending.rb +29 -0
  129. data/lib/stretchy/relations/query_methods/fields.rb +70 -0
  130. data/lib/stretchy/relations/query_methods/filter_query.rb +53 -0
  131. data/lib/stretchy/relations/query_methods/has_field.rb +40 -0
  132. data/lib/stretchy/relations/query_methods/highlight.rb +75 -0
  133. data/lib/stretchy/relations/query_methods/hybrid.rb +60 -0
  134. data/lib/stretchy/relations/query_methods/ids.rb +40 -0
  135. data/lib/stretchy/relations/query_methods/match.rb +52 -0
  136. data/lib/stretchy/relations/query_methods/must_not.rb +54 -0
  137. data/lib/stretchy/relations/query_methods/neural.rb +58 -0
  138. data/lib/stretchy/relations/query_methods/neural_sparse.rb +43 -0
  139. data/lib/stretchy/relations/query_methods/none.rb +21 -0
  140. data/lib/stretchy/relations/query_methods/or_filter.rb +21 -0
  141. data/lib/stretchy/relations/query_methods/order.rb +63 -0
  142. data/lib/stretchy/relations/query_methods/query_string.rb +44 -0
  143. data/lib/stretchy/relations/query_methods/regexp.rb +61 -0
  144. data/lib/stretchy/relations/query_methods/should.rb +51 -0
  145. data/lib/stretchy/relations/query_methods/size.rb +44 -0
  146. data/lib/stretchy/relations/query_methods/skip_callbacks.rb +47 -0
  147. data/lib/stretchy/relations/query_methods/source.rb +59 -0
  148. data/lib/stretchy/relations/query_methods/where.rb +113 -0
  149. data/lib/stretchy/relations/query_methods.rb +48 -569
  150. data/lib/stretchy/relations/scoping/default.rb +136 -0
  151. data/lib/stretchy/relations/scoping/named.rb +70 -0
  152. data/lib/stretchy/relations/scoping/scope_registry.rb +36 -0
  153. data/lib/stretchy/relations/scoping.rb +30 -0
  154. data/lib/stretchy/relations/search_option_methods.rb +2 -0
  155. data/lib/stretchy/version.rb +1 -1
  156. data/lib/stretchy.rb +17 -10
  157. metadata +111 -17
  158. data/lib/stretchy/common.rb +0 -38
  159. data/lib/stretchy/null_relation.rb +0 -53
  160. data/lib/stretchy/persistence.rb +0 -43
  161. data/lib/stretchy/refreshable.rb +0 -15
  162. data/lib/stretchy/scoping/default.rb +0 -134
  163. data/lib/stretchy/scoping/named.rb +0 -68
  164. data/lib/stretchy/scoping/scope_registry.rb +0 -34
  165. data/lib/stretchy/scoping.rb +0 -28
@@ -0,0 +1,326 @@
1
+ # Simple Ingest Pipeline
2
+
3
+ >[!INFO|style:flat:label:Prerequisites]
4
+ >
5
+ >- Opensearch 2.12+ installed and running
6
+ >- Ruby on Rails
7
+ >
8
+ >Follow the [Quick Start](guides/quick-start) for detailed steps.
9
+
10
+ ## Data Source
11
+
12
+ Our data source is JSON data representing vitals and patient information scraped from the dark web (jk). We have an id, vitals as a CSV, a full name, an age, and an SSN with HTML tags. What a mess!
13
+
14
+
15
+ | id | vitals | name | age | ssn |
16
+ | ------- | ---------- | -------------------------- | --- | ------------------ |
17
+ | ta0j288 | 700,120,72 | Gov. Candy Williams | 30 | <b>547-93-4227</b> |
18
+ | ta0j288 | 56,120,72 | Romana Prohaska | 30 | <b>547-93-4227</b> |
19
+ | pnunl70 | 114,136,43 | Tristan Waelchi | 81 | <b>323-23-5997</b> |
20
+ | 8lhscax | 105,66,56 | Antoine Hauck | 46 | <b>381-54-5352</b> |
21
+ | impcbo9 | 119,78,60 | Dewayne Stark | 39 | <b>816-86-6698</b> |
22
+ | jxr8h3v | 81,69,58 | Shelton Powlowski | 77 | <b>810-63-7478</b> |
23
+ | d7lwaln | 103,140,93 | Sage Medhurst | 19 | <b>470-43-3841</b> |
24
+ | ryrtjb5 | 57,118,86 | Tobias Strosin | 76 | <b>197-25-4397</b> |
25
+ | ox227l3 | 82,103,98 | Jessi Barton | 41 | <b>700-41-0042</b> |
26
+ | c7vyqu2 | 103,73,90 | Eliseo Feest | 53 | <b>153-01-6678</b> |
27
+ | i8lbviz | 81,120,91 | The Hon. Zandra Dibbert MD | 55 | <b>881-10-7835</b> |
28
+
29
+ Our goal is to create an ingest pipeline using `stretchy-model` to process this data. The pipeline will transform the vitals from a CSV into an array, remove the HTML tags from the SSN, and split the full name into first and last names.
30
+
31
+ ## Define the Pipeline
32
+
33
+ An ingest pipeline in Elasticsearch allows us to pre-process documents before the actual document indexing occurs. It's a way to transform and enrich the data, making it more useful and easier to work with.
34
+
35
+ ```mermaid
36
+ flowchart LR
37
+
38
+ CSV --> SCRIPT
39
+
40
+ SCRIPT --> HTML_STRIP
41
+
42
+ HTML_STRIP --> CONVERT
43
+
44
+ CONVERT --> REMOVE
45
+
46
+ REMOVE --> INDEX
47
+
48
+ ```
49
+ By doing these transformations as part of the ingest process, we ensure that the data is in the right format and structure for our needs right from the moment it enters Elasticsearch. This makes our subsequent data handling and analysis tasks much easier and more efficient.
50
+
51
+ _app/pipelines/intake_form_pipeline.rb_
52
+
53
+ ```ruby
54
+ class IntakeFormPipeline < Stretchy::Pipeline
55
+
56
+ description "Ingests intake forms and scrubs ssn of html"
57
+
58
+ processor :csv,
59
+ field: :vitals,
60
+ target_fields: [:heart_rate, :systolic, :diastolic],
61
+ trim: true
62
+
63
+ processor :script,
64
+ description: "Extracts first and last name from name field",
65
+ lang: "painless",
66
+ source: <<~PAINLESS
67
+ ctx['name'] = /^[\\w\\s]+\\.\\s/.matcher(ctx['name']).replaceAll("");
68
+ String[] parts = /\\s+/.split(ctx['name']);
69
+ ctx['first_name'] = parts[0];
70
+ if (parts.length > 1) {
71
+ ctx['last_name'] = parts[1];
72
+ }
73
+ PAINLESS
74
+
75
+ processor :html_strip, field: :ssn
76
+ processor :convert, field: :systolic, type: :integer
77
+ processor :convert, field: :diastolic, type: :integer
78
+ processor :convert, field: :heart_rate, type: :integer
79
+
80
+ processor :remove, field: :name
81
+ processor :remove, field: :vitals
82
+
83
+ end
84
+ ```
85
+
86
+ The `IntakeFormPipeline` will preprocess documents that are sent to be indexed. We have a `description` and a series of `processor` statements, each performing a specific transformation on the data:
87
+
88
+ - **csv** - parse `vitals` and map them to `heart_rate`, `systolic` and `diastolic` fields
89
+ - **script** - split `name` into `first_name` and `last_name`, removing any titles like Dr., Rev. etc.
90
+ - **html_strip** - scrub `ssn` of any HTML tags
91
+ - **convert** - ensure vitals are all integers
92
+ - **remove** - remove the fields we no longer need
93
+
94
+ **Create the pipeline:**
95
+
96
+ This command sends a request to the Elasticsearch server to create a new ingest pipeline with the specifications defined in the IntakeFormPipeline class.
97
+
98
+ Once the pipeline is created, it's ready to preprocess any documents that are sent to be indexed. The transformations defined in the pipeline (such as parsing CSVs, splitting names, stripping HTML tags, and converting fields to integers) will be applied to each document before it's indexed.
99
+
100
+ ```ruby
101
+ IntakeFormPipeline.create!
102
+ ```
103
+
104
+ **Response:**
105
+ ```ruby
106
+ #=> {"acknowledged"=>true}
107
+ ```
108
+
109
+ Remember, the pipeline only needs to be created once. After it's created, it will be used automatically whenever documents are indexed in Elasticsearch. If you need to change the pipeline, you can remove it with the `IntakeFormPipeline.delete!` command.
110
+
111
+ ## Describe the Model
112
+
113
+ The `IntakeForm` model represents the index where we’ll store our intake forms.
114
+
115
+ *app/models/intake_form.rb*
116
+
117
+ ```ruby
118
+ class IntakeForm < StretchyModel
119
+ attribute :first_name, :keyword
120
+ attribute :last_name, :keyword
121
+ attribute :ssn, :keyword
122
+ attribute :age, :integer
123
+ attribute :heart_rate, :integer
124
+ attribute :systolic, :integer
125
+ attribute :diastolic, :integer
126
+
127
+ default_pipeline :intake_form_pipeline
128
+ end
129
+ ```
130
+ The IntakeForm model inherits from `StretchyModel`, which means it gets all the functionality provided by `StretchyModel`.
131
+
132
+ The `attribute` method is used to define the fields of the IntakeForm model. Each `attribute` has a name and a type. The type corresponds to the Elasticsearch field type.
133
+
134
+ The `default_pipeline` method sets the default ingest pipeline for the model. In this case, it's set to `:intake_form_pipeline`, which means that the intake_form_pipeline will be used to preprocess documents before they are indexed.
135
+
136
+
137
+ **Create the index:**
138
+
139
+ ```ruby
140
+ IntakeForm.create_index!
141
+ ```
142
+
143
+ **Response:**
144
+ ```ruby
145
+ #=> {"acknowledged"=>true, "shards_acknowledged"=>true, "index"=>"intake_forms"}
146
+ ```
147
+
148
+ ## Run the pipeline
149
+
150
+ To run the pipeline, you'll need to index documents using the `IntakeForm` model. The `default_pipeline` you set earlier will automatically preprocess the documents before they are indexed.
151
+
152
+ ```ruby
153
+ initial_data = [
154
+ {"id": "ta0j288", "vitals": "700,120,72", "name": "Gov. Candy Williams", "age": 30, "ssn": "<b>547-93-4227</b>"},
155
+ {"id": "ta0j288", "vitals": "56,120,72", "name": "Romana Prohaska", "age": 30, "ssn": "<b>547-93-4227</b>"},
156
+ {"id": "pnunl70", "vitals": "114,136,43", "name": "Tristan Waelchi", "age": 81, "ssn": "<b>323-23-5997</b>"},
157
+ {"id": "8lhscax", "vitals": "105,66,56", "name": "Antoine Hauck", "age": 46, "ssn": "<b>381-54-5352</b>"},
158
+ {"id": "impcbo9", "vitals": "119,78,60", "name": "Dewayne Stark", "age": 39, "ssn": "<b>816-86-6698</b>"},
159
+ {"id": "jxr8h3v", "vitals": "81,69,58", "name": "Shelton Powlowski", "age": 77, "ssn": "<b>810-63-7478</b>"},
160
+ {"id": "d7lwaln", "vitals": "103,140,93", "name": "Sage Medhurst", "age": 19, "ssn": "<b>470-43-3841</b>"},
161
+ {"id": "ryrtjb5", "vitals": "57,118,86", "name": "Tobias Strosin", "age": 76, "ssn": "<b>197-25-4397</b>"},
162
+ {"id": "ox227l3", "vitals": "82,103,98", "name": "Jessi Barton", "age": 41, "ssn": "<b>700-41-0042</b>"},
163
+ {"id": "c7vyqu2", "vitals": "103,73,90", "name": "Eliseo Feest", "age": 53, "ssn": "<b>153-01-6678</b>"},
164
+ {"id": "i8lbviz", "vitals": "81,120,91", "name": "The Hon. Zandra Dibbert MD", "age": 55, "ssn": "<b>881-10-7835</b>"}
165
+ ]
166
+ ```
167
+
168
+ #### Simulate
169
+ We can simulate `IntakeFormPipeline` to make sure it works as expected.
170
+
171
+ ```ruby
172
+ docs = initial_data.map {|doc| {_source: doc} }
173
+ ```
174
+
175
+ We prepare the initial data by making sure each entry has a `_source` field with the document as the value. This is slightly different from how we'll prepare the data for actual indexing.
176
+
177
+ **Simulate the pipeline:**
178
+
179
+ ```ruby
180
+ IntakeFormPipeline.simulate(docs)
181
+ ```
182
+
183
+ **Response:**
184
+ ```ruby
185
+ #=> {"docs"=>
186
+ # [{"processor_results"=>
187
+ # [{"processor_type"=>"csv",
188
+ # "status"=>"success",
189
+ # "doc"=>
190
+ # {"_index"=>"_index",
191
+ # "_type"=>"_doc",
192
+ # "_id"=>"_id",
193
+ # "_source"=>{"systolic"=>"120", "diastolic"=>"72", "name"=>"Gov. Candy Williams", #"heart_rate"=>"700", "id"=>"ta0j288", "vitals"=>"700,120,72", "age"=>30, "ssn"=>"<b>547-93-4227</#b>"},
194
+ # "_ingest"=>{"pipeline"=>"intake_form_pipeline", "timestamp"=>"2024-03-20T13:06:17.661745464Z"}}},
195
+ # {"processor_type"=>"script",
196
+ # "status"=>"success",
197
+ # "description"=>"Extracts first and last name from name field",
198
+ # "doc"=>
199
+ # {"_index"=>"_index",
200
+ # "_type"=>"_doc",
201
+ # "_id"=>"_id",
202
+ # "_source"=>{"heart_rate"=>"700", "last_name"=>"Williams", "ssn"=>"<b>547-93-4227</b>", #"systolic"=>"120", "diastolic"=>"72", "name"=>"Candy Williams", "id"=>"ta0j288", #"first_name"=>"Candy", "vitals"=>"700,120,72", "age"=>30},
203
+ # "_ingest"=>{"pipeline"=>"intake_form_pipeline", "timestamp"=>"2024-03-20T13:06:17.661745464Z"}}},
204
+ # ...
205
+ ```
206
+
207
+ The response should show the results of the simulation, with each processor step and its status.
208
+
209
+ #### Ingest
210
+
211
+ Now, let’s ingest the data into the index. We’ll use a bulk request to index our documents:
212
+
213
+ ```ruby
214
+ bulk_records = initial_data.map do |data|
215
+ { index: { _index: IntakeForm.index_name, data: data } }
216
+ end
217
+
218
+ IntakeForm.bulk(bulk_records)
219
+ ```
220
+
221
+ **Response:**
222
+ ```ruby
223
+ =>
224
+ {"took"=>3,
225
+ "ingest_took"=>1,
226
+ "errors"=>false,
227
+ "items"=>
228
+ [{"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"vzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>0, "_primary_term"=>1, "status"=>201}},
229
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>1, "_primary_term"=>1, "status"=>201}},
230
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>2, "_primary_term"=>1, "status"=>201}},
231
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wjz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>3, "_primary_term"=>1, "status"=>201}},
232
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"wzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>4, "_primary_term"=>1, "status"=>201}},
233
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>5, "_primary_term"=>1, "status"=>201}},
234
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>6, "_primary_term"=>1, "status"=>201}},
235
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xjz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>7, "_primary_term"=>1, "status"=>201}},
236
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"xzz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>8, "_primary_term"=>1, "status"=>201}},
237
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"yDz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>9, "_primary_term"=>1, "status"=>201}},
238
+ {"index"=>{"_index"=>"intake_forms", "_type"=>"_doc", "_id"=>"yTz8W44BuORXSU88oIRr", "_version"=>1, "result"=>"created", "_shards"=>{"total"=>2, "successful"=>1, "failed"=>0}, "_seq_no"=>10, "_primary_term"=>1, "status"=>201}}]}
239
+
240
+ ```
241
+
242
+ Our ingest pipeline will perform all of the operations we defined as `processors` in the `IntakeFormPipeline` and index the resulting document.
243
+
244
+ Let's see how it did:
245
+
246
+ ```ruby
247
+ IntakeForm.count
248
+ #=> 11
249
+
250
+ IntakeForm.first.heart_rate
251
+ #=> 700
252
+ ```
253
+
254
+ Wow! The Gov. must be having as much fun as us with a heart rate like that.
255
+
256
+ Let's get the average heart rate per age group:
257
+
258
+ ```ruby
259
+ results = IntakeForm.range(:ages, {
260
+ field: :age,
261
+ ranges: [
262
+ {from: 19, to: 39},
263
+ {from: 40, to: 59},
264
+ {from: 60, to: 79},
265
+ {from: 80}
266
+ ],
267
+ keyed: true
268
+ },
269
+ aggs: {avg_heart_rate: {avg: {field: :heart_rate}}}).size(0)
270
+
271
+ ap results.aggregations.ages.buckets
272
+ ```
273
+
274
+ **Response:**
275
+ ```ruby
276
+
277
+ {
278
+ "19.0-39.0" => {
279
+ "from" => 19.0,
280
+ "to" => 39.0,
281
+ "doc_count" => 3,
282
+ "avg_heart_rate" => {
283
+ "value" => 286.3333333333333
284
+ }
285
+ },
286
+ "40.0-59.0" => {
287
+ "from" => 40.0,
288
+ "to" => 59.0,
289
+ "doc_count" => 4,
290
+ "avg_heart_rate" => {
291
+ "value" => 92.75
292
+ }
293
+ },
294
+ "60.0-79.0" => {
295
+ "from" => 60.0,
296
+ "to" => 79.0,
297
+ "doc_count" => 2,
298
+ "avg_heart_rate" => {
299
+ "value" => 69.0
300
+ }
301
+ },
302
+ "80.0-*" => {
303
+ "from" => 80.0,
304
+ "doc_count" => 1,
305
+ "avg_heart_rate" => {
306
+ "value" => 114.0
307
+ }
308
+ }
309
+ }
310
+ ```
311
+
312
+ In this guide, we've walked through the process of creating an ingest pipeline with Elasticsearch using `stretchy-model`.
313
+
314
+ We started with a dataset of patient information, which included fields that needed preprocessing before indexing. We defined an ingest pipeline, `IntakeFormPipeline`, that transformed the data into a more useful format, including parsing CSVs, splitting names, removing HTML tags, and converting fields to integers.
315
+
316
+ We then used the `IntakeForm` model, which inherits from `StretchyModel`, to index the preprocessed data in Elasticsearch. We also demonstrated how to run aggregations on the indexed data to get insights, such as the average heart rate per age group.
317
+
318
+ This is a simple example, but ingest pipelines can be much more complex and powerful, allowing you to preprocess your data in many different ways before indexing. With `stretchy-model`, you can leverage the full power of Elasticsearch's ingest pipelines while writing Ruby code that feels familiar and idiomatic.
319
+
320
+ ## Cleaning up
321
+
322
+ ```ruby
323
+ IntakeForm.delete_index!
324
+ IntakeFormPipeline.delete!
325
+ ```
326
+
@@ -0,0 +1,14 @@
1
+ * [__Readme__](/)
2
+
3
+ * __Guides__
4
+ * [Quick Start](guides/quick-start?id=quick-start)
5
+ * [Models](guides/models?id=models)
6
+ * [Querying](guides/querying?id=querying)
7
+ * [Scopes](guides/scopes?id=scopes)
8
+ * [Aggregations](guides/aggregations?id=aggregations)
9
+ * [Pipelines](guides/pipelines?id=pipelines)
10
+ * [Machine Learning](guides/machine-learning?id=machine-learning)
11
+
12
+ * __Examples__
13
+ * [Data Analysis](examples/data_analysis)
14
+ * [Simple Ingest Pipeline](examples/simple-ingest-pipeline)
@@ -0,0 +1,142 @@
1
+ # Aggregations
2
+
3
+ Aggregations in Elasticsearch allow you to get summary information about your data. For example, you can use aggregations to count the number of records that match certain criteria, calculate the average value of a field, find the minimum or maximum value, and more.
4
+
5
+
6
+ When performing aggregations it's good practice to set `size(0)` if you don't need the source documents.
7
+
8
+ ```ruby
9
+ results = Profile.aggregation(:flagged_counts, terms: {field: :flagged}).size(0)
10
+ ```
11
+
12
+ Aggregation results are available on the result's `aggregations` object under the name provided to the aggregation:
13
+
14
+ ```ruby
15
+ results.aggregations.flagged_counts
16
+ ```
17
+
18
+ returns:
19
+ ```ruby
20
+ {
21
+ "doc_count_error_upper_bound"=>0,
22
+ "sum_other_doc_count"=>0,
23
+ "buckets"=>[
24
+ {"key"=>"true", "doc_count"=>123},
25
+ {"key"=>"false", "doc_count"=>456}
26
+ ]
27
+ }
28
+ ```
29
+
30
+ >[!TIP|label:Accessing Aggregation Results]
31
+ >You can access the entire structure through dot notation.
32
+ >
33
+ >`aggregations.flagged_counts.buckets.first.doc_count` => `123`
34
+
35
+
36
+ ---
37
+
38
+ In Stretchy, you use the `aggregation` method to define aggregations. Here are some examples:
39
+
40
+ ### Count by status
41
+
42
+ If you have a `status` field and you want to count how many records there are for each status, you can use a terms aggregation:
43
+ ```ruby
44
+ Profile.aggregation(:status_count, terms: { field: :status })
45
+ ```
46
+
47
+
48
+ ### Average Age
49
+
50
+ If you have an age field and you want to calculate the average age, you can do this:
51
+
52
+ ```ruby
53
+ Profile.aggregation(:average_age, avg: { field: :age })
54
+ ```
55
+
56
+ ### Minimum and Maximum Age
57
+
58
+ If you want to find the minimum and maximum age, you can do this:
59
+
60
+ ```ruby
61
+ Profile.aggregation(:min_age, min: { field: :age })
62
+ Profile.aggregation(:max_age, max: { field: :age })
63
+ ```
64
+
65
+ ### Date Histogram
66
+
67
+ If you have a `created_at` field and you want to count how many profiles were created in each month, you can do this:
68
+ ```ruby
69
+ Profile.aggregation(:profiles_over_time, date_histogram: { field: :created_at, interval: 'month' })
70
+ ```
71
+
72
+ In these examples, the first argument to the aggregation method is the name of the aggregation, and the second argument is a hash that defines the aggregation. The key of the hash is the type of the aggregation (terms, avg, min, max, or date_histogram), and the value is another hash that specifies the field to aggregate on and other options.
73
+
74
+ ## Named Aggregation Helpers
75
+
76
+ The above shows how to use the `aggregation` method directly, but Stretchy makes working with named aggregations even easier. Named aggregation helpers make calling the aggregation you want a breeze.
77
+
78
+ The documentation goes into depth for all available [aggregation types](/doc/stretchy/relations/AggregationMethods)
79
+
80
+ ### Percentiles
81
+
82
+ The percentiles aggregation method calculates the percentiles of a numeric field. For example, if you want to calculate the 25th, 50th, and 75th percentiles of the age field, you can do this:
83
+ ```ruby
84
+ Profile.percentiles(:age_percentiles, field: :age, percents: [25, 50, 75])
85
+ ```
86
+
87
+ ### Extended Stats
88
+
89
+ The extended_stats aggregation method calculates several statistical measures of a numeric field, including the count, min, max, sum, average, sum of squares, variance, standard deviation, and bounds. For example, if you want to calculate these measures for the age field, you can do this:
90
+
91
+ ```ruby
92
+ Profile.extended_stats(:age_stats, field: :age)
93
+ ```
94
+
95
+ ### Date Range
96
+
97
+ The date_range aggregation method groups documents by whether their date field falls within specified ranges. For example, if you want to count how many profiles were created before and after a certain date, you can do this:
98
+ ```ruby
99
+ Profile.date_range(:created_at_range, field: :created_at, ranges: [{ to: '2022-01-01' }, { from: '2022-01-01' }])
100
+ ```
101
+
102
+ ### Significant Terms
103
+
104
+ The significant_terms aggregation method finds the terms that appear more often in the documents that match your query than in the documents that don't. For example, if you want to find the tags that are significantly associated with profiles that have a status of "active", you can do this:
105
+ ```ruby
106
+ Profile.where(status: 'active').significant_terms(:significant_tags, field: :tags)
107
+ ```
108
+ In these examples, the first argument to the aggregation method is the name of the aggregation, and the second argument is a hash that specifies the field to aggregate on and other options. The exact options depend on the aggregation method.
109
+
110
+ ## Nested Aggregations
111
+
112
+ Elasticsearch supports complex aggregations by allowing you to nest sub-aggregations within top-level aggregations. These sub-aggregations operate within the context of the parent aggregation, allowing you to refine and group your data in various ways.
113
+
114
+ There are three main types of aggregations in Elasticsearch: bucket, metric, and pipeline aggregations.
115
+
116
+ #### Bucket Aggregations
117
+ These aggregations create buckets or sets of documents based on certain criteria. Examples include `terms`, `date_histogram`, `range`, and `significant_terms` aggregations. Each bucket effectively defines a document set, and any sub-aggregations operate within the context of that set.
118
+
119
+ For example, you could use a terms aggregation to group documents by the status field, and then use a sub-aggregation to calculate the average age within each status group:
120
+ ```ruby
121
+ Profile.aggregation(:status_avg_age, terms: { field: :status }, aggs: { avg_age: { avg: { field: :age } } })
122
+ ```
123
+
124
+ #### Metric Aggregations
125
+ These aggregations perform calculations on the documents in each bucket, producing a single numeric result. Examples include `avg`, `sum`, `min`, `max`, and `extended_stats`.
126
+
127
+ For example, you could use a terms aggregation to group documents by the status field, and then use a max sub-aggregation to find the maximum age within each status group:
128
+ ```ruby
129
+ Profile.aggregation(:status_max_age, terms: { field: :status }, aggs: { max_age: { max: { field: :age } } })
130
+ ```
131
+
132
+ #### Pipeline Aggregations
133
+ These aggregations perform calculations on the results of other aggregations, allowing you to create complex summaries of your data. Examples include `avg_bucket`, `sum_bucket`, `min_bucket`, `max_bucket`, and `stats_bucket`.
134
+
135
+ For example, you could use a date_histogram aggregation to count documents by month, and then use a sum_bucket sub-aggregation to calculate the total count over all months:
136
+ ```ruby
137
+ Profile.aggregation(:total_count_over_time, date_histogram: { field: :created_at, interval: 'month' }, aggs: { total_count: { sum_bucket: { buckets_path: '_count' } } })
138
+ # or
139
+ Profile.date_histogram(:total_count_over_time, {field: :created_at, interval: :month}, aggs: {total_count: { sum_bucket: { buckets_path: '_count' } } })
140
+ ```
141
+
142
+ In these examples, the aggs option is used to define sub-aggregations. The key is the name of the sub-aggregation, and the value is a hash that defines the sub-aggregation.
@@ -0,0 +1,154 @@
1
+ # Machine Learning
2
+ >[!NOTE|style:flat|label:OpenSearch Compatibility]
3
+ > OpenSearch and Elasticsearch diverge in how they handle machine learning APIs. These features are in active development and subject to change.
4
+ >
5
+ > This guide largely covers OpenSearch Machine Learning unless otherwise stated.
6
+
7
+ >[!WARNING|label:Machine Learning on Elasticsearch]
8
+ > Elasticsearch requires a license to enable ML capabilities
9
+
10
+ ## Models
11
+ Machine Learning models follow a specific convention for storing model definitions. This helps us keep our code organized and easy to navigate.
12
+
13
+ - *app/machine_learning/models/example_machine_learning_model.rb*
14
+
15
+ A `MachineLearningModel` consists of the following components:
16
+
17
+ ```ruby
18
+ class SparseEncodingModel < Stretchy::MachineLearning::Model
19
+   model :neural_sparse_encoding,
20
+ version: '1.0.1',
21
+ model_format: 'TORCH_SCRIPT',
22
+ description: 'Creates sparse embedding for onboarding docs'
23
+ end
24
+ ```
25
+ - `model:` This is the name of the model. It should match one of the pre-trained models available in your application. In this case, it's :neural_sparse_encoding.
26
+
27
+ - `version:` This is the version of the model. It's important to specify this, as different versions of the same model may have different behaviors or requirements.
28
+
29
+ - `model_format:` This is the format of the model. It tells Stretchy how to interpret the model file. In this case, it's 'TORCH_SCRIPT', which means the model is a TorchScript file. TorchScript is a way to serialize PyTorch models.
30
+
31
+ - `description:` This is a brief description of what the model does. It's a good practice to provide a meaningful description so that others can understand the purpose of your model at a glance. In this case, the description is 'Creates sparse embedding for onboarding docs'.
32
+
33
+
34
+
35
+ ## Managing Models
36
+
37
+ >[!TIP|label:Machine Learning Settings]
38
+ > When running development or single-node clusters you may need to adjust your cluster settings to allow Machine Learning models to run on all nodes instead of dedicated machine learning nodes.
39
+ > Add `Stretchy::MachineLearning::Model.ml_on_all_nodes!` to your *config/environments/development.rb* file to enable machine learning on all nodes.
40
+
41
+ ### register
42
+ Registers the machine learning model.
43
+
44
+ ```ruby
45
+ MyMachineLearningModel.register
46
+ ```
47
+ The `register` operation is asynchronous and can take some time to complete. To wait until the operation is complete use the helper method `wait_until_complete` in combination with the `registered?` method:
48
+ ```ruby
49
+ MyMachineLearningModel.register do |model|
50
+ model.wait_until_complete do
51
+ model.registered?
52
+ end
53
+ end
54
+ ```
55
+
56
+ ### registered?
57
+ Checks the model status and returns true if `model_id` is present and `state` is `COMPLETED`
58
+
59
+ ```ruby
60
+ MyMachineLearningModel.registered?
61
+ ```
62
+
63
+ ### status
64
+ Returns the status of the model registration
65
+
66
+ ```ruby
67
+ MyMachineLearningModel.status
68
+ ```
69
+
70
+ ### deploy
71
+ Deploys the model making it available for use. Requires the model to be registered.
72
+
73
+ ```ruby
74
+ MyMachineLearningModel.deploy
75
+ ```
76
+
77
+ The `deploy` operation is asynchronous and can take some time to complete. Use the `wait_until_complete` method in combination with `deployed?` to wait until the model is deployed.
78
+
79
+ ```ruby
80
+ MyMachineLearningModel.deploy do |model|
81
+ model.wait_until_complete(sleep_time: 5) do
82
+ model.deployed?
83
+ end
84
+ end
85
+ ```
86
+
87
+ ### undeploy
88
+ Undeploys the model.
89
+
90
+ ```ruby
91
+ MyMachineLearningModel.undeploy
92
+ ```
93
+
94
+ ### deployed?
95
+ Gets the model and checks if `model_state` is `DEPLOYED`
96
+
97
+ ```ruby
98
+ MyMachineLearningModel.deployed?
99
+ ```
100
+
101
+ ### delete
102
+ Deletes the model. The model must be undeployed before it can be deleted.
103
+
104
+ ```ruby
105
+ MyMachineLearningModel.delete
106
+ ```
107
+
108
+ ### wait_until_complete
109
+ A helper that waits for asynchronous tasks to complete. Accepts `max_attempts` and `sleep_time`.
110
+
111
+ ```ruby
112
+ MyMLModel.register do |model|
113
+ model.wait_until_complete(max_attempts: 20, sleep_time: 4) do
114
+ # finish waiting if last statement is true
115
+ model.registered?
116
+ end
117
+ end
118
+ ```
119
+
120
+ ### all
121
+ Returns all registered models.
122
+
123
+ ```ruby
124
+ MyMLModel.all
125
+ ```
126
+
127
+ ## Pre-trained models
128
+ OpenSearch provides a variety of pre-trained models for different tasks:
129
+
130
+ ### Neural Sparse Models
131
+ - `:neural_sparse_encoding` - 'amazon/neural-sparse/opensearch-neural-sparse-encoding-v1'
132
+ - `:neural_sparse_encoding_doc` - 'amazon/neural-sparse/opensearch-neural-sparse-encoding-doc-v1'
133
+ - `:neural_sparse_tokenizer` - 'amazon/neural-sparse/opensearch-neural-sparse-tokenizer-v1'
134
+
135
+ ### Cross Encoder Models
136
+ - `:cross_encoder_minilm_6` - 'huggingface/cross-encoders/ms-marco-MiniLM-L-6-v2'
137
+ - `:cross_encoder_minilm_12` - 'huggingface/cross-encoders/ms-marco-MiniLM-L-12-v2'
138
+
139
+ ### Sentence Transformer Models
140
+ - `:sentence_transformers_roberta_all` - 'huggingface/sentence-transformers/all-distilroberta-v1'
141
+ - `:sentence_transformers_msmarco` - 'huggingface/sentence-transformers/msmarco-distilroberta-base-v2'
142
+ - `:sentence_transformers_minilm_6` - 'huggingface/sentence-transformers/all-MiniLM-L6-v2'
143
+ - `:sentence_transformers_minilm_12` - 'huggingface/sentence-transformers/all-MiniLM-L12-v2'
144
+ - `:sentence_transformers_mpnet` - 'huggingface/sentence-transformers/all-mpnet-base-v2'
145
+ - `:sentence_transformers_multi_qa_minilm_6` - 'huggingface/sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
146
+ - `:sentence_transformers_multi_qa_mpnet` - 'huggingface/sentence-transformers/multi-qa-mpnet-base-dot-v1'
147
+ - `:sentence_transformers_paraphrase_minilm_3` - 'huggingface/sentence-transformers/paraphrase-MiniLM-L3-v2'
148
+ - `:sentence_transformers_paraphrase_multilingual_minilm_12` - 'huggingface/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
149
+ - `:sentence_transformers_paraphrase_mpnet` - 'huggingface/sentence-transformers/paraphrase-mpnet-base-v2'
150
+ - `:sentence_transformers_multilingual_distiluse_cased` - 'huggingface/sentence-transformers/distiluse-base-multilingual-cased-v1'
151
+
152
+ ## Custom Models
153
+
154
+ Refer to the OpenSearch documentation on [deploying custom local models](https://opensearch.org/docs/latest/ml-commons-plugin/custom-local-models/)