goldmine 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/Gemfile.lock +14 -29
- data/README.md +134 -136
- data/lib/goldmine.rb +7 -13
- data/lib/goldmine/miner.rb +20 -0
- data/lib/goldmine/pivot.rb +52 -0
- data/lib/goldmine/pivot_result.rb +21 -0
- data/lib/goldmine/rollup.rb +41 -0
- data/lib/goldmine/rollup_clean_room.rb +21 -0
- data/lib/goldmine/rollup_result.rb +45 -0
- data/lib/goldmine/version.rb +1 -1
- data/test/test_goldmine.rb +149 -277
- metadata +11 -13
- data/goldmine.gemspec +0 -20
- data/lib/goldmine/array_miner.rb +0 -67
- data/lib/goldmine/cache.rb +0 -28
- data/lib/goldmine/hash_miner.rb +0 -81
- data/lib/goldmine/hash_rollup.rb +0 -83
- data/lib/goldmine/rollup_context.rb +0 -23
- data/license.md +0 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d630803c577229ae72b1ca7bd4e7125840e3f9b6
|
4
|
+
data.tar.gz: 8684e82fa2691af08618d5cb845ec500c99176b5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bfc80b08483357bd6cb61260782b7424cff279c8bbdc8d59247edc54fb2e0ed070d9d6797a063bc5af6ab767547854beb9ca1929f20dee9a94b2406081bb550c
|
7
|
+
data.tar.gz: c7dab7e7f518893b98f5213238d10ad0975b9556209084267d7de1bdec1612bde78e3483e27490f201dd8e5c2ed1cd6924723a0bab6b828995123cd9f011faa7
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,41 +1,33 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
goldmine (
|
4
|
+
goldmine (3.0.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
binding_of_caller (0.7.2)
|
10
10
|
debug_inspector (>= 0.0.1)
|
11
|
-
byebug (
|
12
|
-
|
13
|
-
|
14
|
-
columnize (0.9.0)
|
15
|
-
coveralls (0.8.3)
|
11
|
+
byebug (8.2.4)
|
12
|
+
coderay (1.1.1)
|
13
|
+
coveralls (0.8.13)
|
16
14
|
json (~> 1.8)
|
17
|
-
|
18
|
-
simplecov (~> 0.10.0)
|
15
|
+
simplecov (~> 0.11.0)
|
19
16
|
term-ansicolor (~> 1.3)
|
20
17
|
thor (~> 0.19.1)
|
18
|
+
tins (~> 1.6.0)
|
21
19
|
debug_inspector (0.0.2)
|
22
20
|
docile (1.1.5)
|
23
|
-
domain_name (0.5.25)
|
24
|
-
unf (>= 0.0.5, < 1.0.0)
|
25
|
-
http-cookie (1.0.2)
|
26
|
-
domain_name (~> 0.5)
|
27
21
|
interception (0.5)
|
28
22
|
json (1.8.3)
|
29
23
|
method_source (0.8.2)
|
30
|
-
mime-types (2.6.2)
|
31
|
-
netrc (0.10.3)
|
32
24
|
os (0.9.6)
|
33
25
|
pry (0.10.3)
|
34
26
|
coderay (~> 1.1.0)
|
35
27
|
method_source (~> 0.8.1)
|
36
28
|
slop (~> 3.4)
|
37
|
-
pry-byebug (3.
|
38
|
-
byebug (~>
|
29
|
+
pry-byebug (3.3.0)
|
30
|
+
byebug (~> 8.0)
|
39
31
|
pry (~> 0.10)
|
40
32
|
pry-rescue (1.4.2)
|
41
33
|
interception (>= 0.5)
|
@@ -52,29 +44,22 @@ GEM
|
|
52
44
|
rack (1.6.4)
|
53
45
|
rack-protection (1.5.3)
|
54
46
|
rack
|
55
|
-
rake (
|
56
|
-
|
57
|
-
http-cookie (>= 1.0.2, < 2.0)
|
58
|
-
mime-types (>= 1.16, < 3.0)
|
59
|
-
netrc (~> 0.7)
|
60
|
-
simplecov (0.10.0)
|
47
|
+
rake (11.1.2)
|
48
|
+
simplecov (0.11.2)
|
61
49
|
docile (~> 1.1.0)
|
62
50
|
json (~> 1.8)
|
63
51
|
simplecov-html (~> 0.10.0)
|
64
52
|
simplecov-html (0.10.0)
|
65
|
-
sinatra (1.4.
|
66
|
-
rack (~> 1.
|
53
|
+
sinatra (1.4.7)
|
54
|
+
rack (~> 1.5)
|
67
55
|
rack-protection (~> 1.4)
|
68
56
|
tilt (>= 1.3, < 3)
|
69
57
|
slop (3.6.0)
|
70
58
|
term-ansicolor (1.3.2)
|
71
59
|
tins (~> 1.0)
|
72
60
|
thor (0.19.1)
|
73
|
-
tilt (2.0.
|
61
|
+
tilt (2.0.2)
|
74
62
|
tins (1.6.0)
|
75
|
-
unf (0.1.4)
|
76
|
-
unf_ext
|
77
|
-
unf_ext (0.0.7.1)
|
78
63
|
|
79
64
|
PLATFORMS
|
80
65
|
ruby
|
@@ -87,4 +72,4 @@ DEPENDENCIES
|
|
87
72
|
sinatra
|
88
73
|
|
89
74
|
BUNDLED WITH
|
90
|
-
1.
|
75
|
+
1.11.2
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
[](http://blog.codinghorror.com/the-best-code-is-no-code-at-all/)
|
2
2
|
[](https://codeclimate.com/github/hopsoft/goldmine)
|
3
3
|
[](https://gemnasium.com/hopsoft/goldmine)
|
4
4
|
[](https://travis-ci.org/hopsoft/goldmine)
|
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
# Goldmine
|
9
9
|
|
10
|
-
Extract a wealth of information from Arrays
|
10
|
+
Extract a wealth of information from Arrays.
|
11
11
|
|
12
12
|
Goldmine is especially helpful when working with source data that is difficult to query.
|
13
13
|
e.g. CSV files, API results, etc...
|
@@ -28,158 +28,129 @@ gem install goldmine
|
|
28
28
|
|
29
29
|
```ruby
|
30
30
|
require "goldmine"
|
31
|
-
|
32
|
-
list = [1,2,3,4,5,6,7,8,9]
|
33
|
-
Goldmine::ArrayMiner.new(list)
|
34
|
-
.pivot { |i| i < 5 }
|
35
|
-
# result:
|
36
|
-
{
|
37
|
-
true => [1, 2, 3, 4],
|
38
|
-
false => [5, 6, 7, 8, 9]
|
39
|
-
}
|
40
31
|
```
|
41
32
|
|
42
|
-
## Chained Pivots
|
43
|
-
|
44
33
|
```ruby
|
45
34
|
list = [1,2,3,4,5,6,7,8,9]
|
46
|
-
Goldmine::ArrayMiner.new(list)
|
47
|
-
.pivot { |i| i < 5 }
|
48
|
-
.pivot { |i| i % 2 == 0 }
|
49
|
-
# result:
|
50
|
-
{
|
51
|
-
[true, false] => [1, 3],
|
52
|
-
[true, true] => [2, 4],
|
53
|
-
[false, false] => [5, 7, 9],
|
54
|
-
[false, true] => [6, 8]
|
55
|
-
}
|
56
|
-
```
|
57
35
|
|
58
|
-
|
36
|
+
Goldmine(list)
|
37
|
+
.pivot("< 5") { |i| i < 5 }
|
38
|
+
.result
|
39
|
+
.to_h
|
40
|
+
```
|
59
41
|
|
60
42
|
```ruby
|
61
|
-
list = [1,2,3,4,5,6,7,8,9]
|
62
|
-
Goldmine::ArrayMiner.new(list)
|
63
|
-
.pivot(:less_than_5) { |i| i < 5 }
|
64
|
-
# result:
|
65
43
|
{
|
66
|
-
|
67
|
-
|
44
|
+
[["< 5", true]] => [1, 2, 3, 4],
|
45
|
+
[["< 5", false]] => [5, 6, 7, 8, 9]
|
68
46
|
}
|
69
47
|
```
|
70
48
|
|
71
|
-
## Value Pivots
|
49
|
+
## Array Value Pivots
|
72
50
|
|
73
51
|
```ruby
|
74
|
-
|
52
|
+
users = [
|
75
53
|
{ :name => "Sally", :favorite_colors => [:blue] },
|
76
54
|
{ :name => "John", :favorite_colors => [:blue, :green] },
|
77
55
|
{ :name => "Stephen", :favorite_colors => [:red, :pink, :purple] },
|
78
56
|
{ :name => "Emily", :favorite_colors => [:orange, :green] },
|
79
57
|
{ :name => "Joe", :favorite_colors => [:red] }
|
80
58
|
]
|
81
|
-
|
82
|
-
|
83
|
-
|
59
|
+
|
60
|
+
Goldmine(users)
|
61
|
+
.pivot(:favorite_color) { |record| record[:favorite_colors] }
|
62
|
+
.result
|
63
|
+
.to_h
|
64
|
+
```
|
65
|
+
|
66
|
+
```ruby
|
84
67
|
{
|
85
|
-
:blue => [
|
86
|
-
|
87
|
-
{
|
88
|
-
],
|
89
|
-
:
|
90
|
-
|
91
|
-
{ :name => "Emily", :favorite_colors => [:orange, :green] }
|
92
|
-
],
|
93
|
-
:red => [
|
94
|
-
{ :name => "Stephen", :favorite_colors => [:red, :pink, :purple] },
|
95
|
-
{ :name => "Joe", :favorite_colors => [:red] }
|
96
|
-
],
|
97
|
-
:pink => [
|
98
|
-
{ :name => "Stephen", :favorite_colors => [:red, :pink, :purple] }
|
99
|
-
],
|
100
|
-
:purple => [
|
101
|
-
{ :name => "Stephen", :favorite_colors => [:red, :pink, :purple] }
|
102
|
-
],
|
103
|
-
:orange => [
|
104
|
-
{ :name => "Emily", :favorite_colors => [:orange, :green] }
|
105
|
-
]
|
68
|
+
[:favorite_color, :blue] => [{:name=>"Sally", :favorite_colors=>[:blue]}, {:name=>"John", :favorite_colors=>[:blue, :green]}],
|
69
|
+
[:favorite_color, :green] => [{:name=>"John", :favorite_colors=>[:blue, :green]}, {:name=>"Emily", :favorite_colors=>[:orange, :green]}],
|
70
|
+
[:favorite_color, :red] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}, {:name=>"Joe", :favorite_colors=>[:red]}],
|
71
|
+
[:favorite_color, :pink] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}],
|
72
|
+
[:favorite_color, :purple] => [{:name=>"Stephen", :favorite_colors=>[:red, :pink, :purple]}],
|
73
|
+
[:favorite_color, :orange] => [{:name=>"Emily", :favorite_colors=>[:orange, :green]}]
|
106
74
|
}
|
107
75
|
```
|
108
76
|
|
109
|
-
##
|
77
|
+
## Chained pivots
|
110
78
|
|
111
79
|
```ruby
|
112
|
-
|
80
|
+
users = [
|
113
81
|
{ :name => "Sally", :age => 21 },
|
114
82
|
{ :name => "John", :age => 28 },
|
115
83
|
{ :name => "Stephen", :age => 37 },
|
116
84
|
{ :name => "Emily", :age => 32 },
|
117
85
|
{ :name => "Joe", :age => 18 }
|
118
86
|
]
|
119
|
-
|
120
|
-
|
121
|
-
!!
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
87
|
+
|
88
|
+
Goldmine(users).
|
89
|
+
pivot("'e' in name") { |user| !!user[:name].match(/e/i) }.
|
90
|
+
pivot("21 or over") { |user| user[:age] >= 21 }.
|
91
|
+
result.
|
92
|
+
to_h
|
93
|
+
```
|
94
|
+
|
95
|
+
```ruby
|
127
96
|
{
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
],
|
132
|
-
{ "Name has an 'e'" => true, ">= 21 years old" => true } => [
|
133
|
-
{ :name => "Stephen", :age => 37 },
|
134
|
-
{ :name => "Emily", :age => 32 }
|
135
|
-
],
|
136
|
-
{ "Name has an 'e'" => true, ">= 21 years old" => false } => [
|
137
|
-
{ :name => "Joe", :age => 18 }
|
138
|
-
]
|
97
|
+
[["'e' in name", false], ["21 or over", true]] => [{:name=>"Sally", :age=>21}, {:name=>"John", :age=>28}],
|
98
|
+
[["'e' in name", true], ["21 or over", true]] => [{:name=>"Stephen", :age=>37}, {:name=>"Emily", :age=>32}],
|
99
|
+
[["'e' in name", true], ["21 or over", false]] => [{:name=>"Joe", :age=>18}]
|
139
100
|
}
|
140
101
|
```
|
141
102
|
|
142
103
|
## Rollups
|
143
104
|
|
144
|
-
|
145
|
-
|
105
|
+
An intuitive way to aggregate pivoted data...
|
106
|
+
i.e. computed columns.
|
146
107
|
|
147
|
-
|
148
|
-
|
108
|
+
Rollups are `blocks` that get executed once for each pivot entry.
|
109
|
+
_They can also be chained._
|
149
110
|
|
150
111
|
```ruby
|
151
112
|
list = [1,2,3,4,5,6,7,8,9]
|
152
|
-
|
153
|
-
|
154
|
-
.pivot(
|
155
|
-
.
|
156
|
-
|
113
|
+
|
114
|
+
Goldmine(list)
|
115
|
+
.pivot("< 5") { |i| i < 5 }
|
116
|
+
.pivot("even") { |i| i % 2 == 0 }
|
117
|
+
.result
|
118
|
+
.rollup("count", &:count)
|
119
|
+
.result
|
120
|
+
.to_h
|
121
|
+
```
|
122
|
+
|
123
|
+
```ruby
|
157
124
|
{
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
125
|
+
[["< 5", true], ["even", false]] => [["count", 2]],
|
126
|
+
[["< 5", true], ["even", true]] => [["count", 2]],
|
127
|
+
[["< 5", false], ["even", false]] => [["count", 3]],
|
128
|
+
[["< 5", false], ["even", true]] => [["count", 2]]
|
162
129
|
}
|
163
130
|
```
|
164
131
|
|
165
|
-
###
|
132
|
+
### Rollup Caching
|
166
133
|
|
167
|
-
Rollups can be computationally expensive
|
168
|
-
|
134
|
+
Rollups can be computationally expensive.
|
135
|
+
Optional caching can be used to reduce this computational overhead.
|
169
136
|
|
170
137
|
```ruby
|
171
138
|
list = [1,2,3,4,5,6,7,8,9]
|
172
|
-
|
139
|
+
|
140
|
+
Goldmine(list)
|
173
141
|
.pivot(:less_than_5) { |i| i < 5 }
|
174
|
-
.
|
175
|
-
.rollup(:
|
176
|
-
.rollup(:
|
177
|
-
|
178
|
-
|
179
|
-
|
142
|
+
.result
|
143
|
+
.rollup(:count, &:count)
|
144
|
+
.rollup(:evens) { |list| list.select { |i| i % 2 == 0 }.count }
|
145
|
+
.rollup(:even_percentage) { |list| cache[:evens] / cache[:count].to_f }
|
146
|
+
.result(cache: true)
|
147
|
+
.to_h
|
148
|
+
```
|
149
|
+
|
150
|
+
```ruby
|
180
151
|
{
|
181
|
-
|
182
|
-
|
152
|
+
[[:less_than_5, true]] => [[:count, 4], [:evens, 2], [:even_percentage, 0.5]],
|
153
|
+
[[:less_than_5, false]] => [[:count, 5], [:evens, 2], [:even_percentage, 0.4]]
|
183
154
|
}
|
184
155
|
```
|
185
156
|
|
@@ -189,18 +160,35 @@ It's often helpful to flatten rollups into rows.
|
|
189
160
|
|
190
161
|
```ruby
|
191
162
|
list = [1,2,3,4,5,6,7,8,9]
|
192
|
-
|
163
|
+
|
164
|
+
rollup = Goldmine(list)
|
193
165
|
.pivot(:less_than_5) { |i| i < 5 }
|
194
|
-
.
|
195
|
-
.rollup(:
|
196
|
-
.rollup(:
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
166
|
+
.result
|
167
|
+
.rollup(:count, &:count)
|
168
|
+
.rollup(:evens) { |list| list.select { |i| i % 2 == 0 }.count }
|
169
|
+
.rollup(:even_percentage) { |list| cache[:evens] / cache[:count].to_f }
|
170
|
+
.result(cache: true)
|
171
|
+
```
|
172
|
+
|
173
|
+
```ruby
|
174
|
+
rollup.to_rows
|
175
|
+
```
|
176
|
+
|
177
|
+
```ruby
|
201
178
|
[
|
202
|
-
|
203
|
-
|
179
|
+
[[:less_than_5, true], [:count, 4], [:evens, 2], [:even_percentage, 0.5]],
|
180
|
+
[[:less_than_5, false], [:count, 5], [:evens, 2], [:even_percentage, 0.4]]
|
181
|
+
]
|
182
|
+
```
|
183
|
+
|
184
|
+
```ruby
|
185
|
+
rollup.to_hash_rows
|
186
|
+
```
|
187
|
+
|
188
|
+
```ruby
|
189
|
+
[
|
190
|
+
{:less_than_5=>true, :count=>4, :evens=>2, :even_percentage=>0.5},
|
191
|
+
{:less_than_5=>false, :count=>5, :evens=>2, :even_percentage=>0.4}
|
204
192
|
]
|
205
193
|
```
|
206
194
|
|
@@ -210,14 +198,19 @@ Rollups can also be converted into tabular format.
|
|
210
198
|
|
211
199
|
```ruby
|
212
200
|
list = [1,2,3,4,5,6,7,8,9]
|
213
|
-
|
201
|
+
|
202
|
+
Goldmine(list)
|
214
203
|
.pivot(:less_than_5) { |i| i < 5 }
|
215
204
|
.pivot(:even) { |i| i % 2 == 0 }
|
216
|
-
.
|
205
|
+
.result
|
206
|
+
.rollup(:count, &:size)
|
207
|
+
.result
|
217
208
|
.to_tabular
|
218
|
-
|
209
|
+
```
|
210
|
+
|
211
|
+
```ruby
|
219
212
|
[
|
220
|
-
[
|
213
|
+
[:less_than_5, :even, :count],
|
221
214
|
[true, false, 2],
|
222
215
|
[true, true, 2],
|
223
216
|
[false, false, 3],
|
@@ -230,22 +223,25 @@ Goldmine::ArrayMiner.new(list)
|
|
230
223
|
Goldmine makes producing CSV output simple.
|
231
224
|
|
232
225
|
```ruby
|
233
|
-
|
226
|
+
list = [1,2,3,4,5,6,7,8,9]
|
227
|
+
|
228
|
+
Goldmine(list)
|
234
229
|
.pivot(:less_than_5) { |i| i < 5 }
|
235
230
|
.pivot(:even) { |i| i % 2 == 0 }
|
231
|
+
.result
|
236
232
|
.rollup(:count) { |matched| matched.size }
|
233
|
+
.result
|
237
234
|
.to_csv_table
|
238
|
-
|
239
|
-
|
235
|
+
.to_csv
|
236
|
+
```
|
240
237
|
|
241
|
-
|
242
|
-
# result:
|
238
|
+
```ruby
|
243
239
|
"less_than_5,even,count\ntrue,false,2\ntrue,true,2\nfalse,false,3\nfalse,true,2\n"
|
244
240
|
```
|
245
241
|
|
246
|
-
##
|
242
|
+
## Example Apps
|
247
243
|
|
248
|
-
All examples are
|
244
|
+
All examples are small Sinatra apps.
|
249
245
|
They are designed to help communicate Goldmine use-cases.
|
250
246
|
|
251
247
|
### Setup
|
@@ -258,7 +254,9 @@ bundle
|
|
258
254
|
|
259
255
|
### [New York Wifi Hotspots](https://github.com/hopsoft/goldmine/tree/master/examples/new_york_wifi_hotspots)
|
260
256
|
|
261
|
-
|
257
|
+
Uses data from https://github.com/hopsoft/goldmine/blob/master/examples/new_york_wifi_hotspots/DOITT_WIFI_HOTSPOT_01_13SEPT2010.csv
|
258
|
+
|
259
|
+
In this example, we mine out the following information.
|
262
260
|
|
263
261
|
* Total hotspots by city, zip, & area code
|
264
262
|
* Free hotspots by city, zip, & area code
|
@@ -284,7 +282,7 @@ curl http://localhost:3000/csv
|
|
284
282
|
|
285
283
|
Uses data from http://dev.socrata.com/foundry/#/data.medicare.gov/aeay-dfax
|
286
284
|
|
287
|
-
In this example, we mine the following
|
285
|
+
In this example, we mine out the following information.
|
288
286
|
|
289
287
|
* Total doctors by state & specialty
|
290
288
|
* Preferred doctors by state & specialty
|
@@ -319,22 +317,22 @@ My Macbook Pro yields the following benchmarks.
|
|
319
317
|
|
320
318
|
```
|
321
319
|
user system total real
|
322
|
-
pivoted
|
323
|
-
rolled_up
|
324
|
-
rows 0.
|
325
|
-
tabular 0.010000 0.000000 0.010000 ( 0.
|
326
|
-
csv 0.
|
320
|
+
pivoted 0.630000 0.030000 0.660000 ( 0.670409)
|
321
|
+
rolled_up 0.570000 0.030000 0.600000 ( 0.626413)
|
322
|
+
rows 0.010000 0.000000 0.010000 ( 0.003258)
|
323
|
+
tabular 0.010000 0.000000 0.010000 ( 0.010110)
|
324
|
+
csv 0.050000 0.000000 0.050000 ( 0.057677)
|
327
325
|
```
|
328
326
|
|
329
327
|
##### 1,000,000 Records
|
330
328
|
|
331
329
|
```
|
332
330
|
user system total real
|
333
|
-
pivoted
|
334
|
-
rolled_up
|
335
|
-
rows 0.
|
336
|
-
tabular 0.010000 0.
|
337
|
-
csv 0.
|
331
|
+
pivoted 7.270000 0.300000 7.570000 ( 8.053166)
|
332
|
+
rolled_up 6.800000 0.830000 7.630000 ( 8.051707)
|
333
|
+
rows 0.000000 0.000000 0.000000 ( 0.003934)
|
334
|
+
tabular 0.010000 0.000000 0.010000 ( 0.011825)
|
335
|
+
csv 0.210000 0.010000 0.220000 ( 0.222752)
|
338
336
|
```
|
339
337
|
|
340
338
|
## Summary
|