km-db 0.2.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +2 -4
  5. data/Gemfile.lock +179 -20
  6. data/Procfile +2 -0
  7. data/Procfile.work +1 -0
  8. data/README.md +186 -0
  9. data/Rakefile +1 -0
  10. data/bin/kmdb-flush +13 -0
  11. data/bin/kmdb-import +13 -0
  12. data/bin/kmdb-partition +15 -0
  13. data/bin/kmdb-pool +8 -0
  14. data/bin/kmdb-realias +12 -0
  15. data/bin/kmdb-ui +6 -0
  16. data/bin/kmdb-work +17 -0
  17. data/config/amazon-rds-ca-cert.pem +260 -0
  18. data/config.ru +8 -0
  19. data/km-db.gemspec +17 -17
  20. data/lib/kmdb/{belongs_to_user.rb → concerns/belongs_to_user.rb} +3 -3
  21. data/lib/kmdb/concerns/has_properties.rb +35 -0
  22. data/lib/kmdb/jobs/find_files.rb +32 -0
  23. data/lib/kmdb/jobs/list_files.rb +37 -0
  24. data/lib/kmdb/jobs/locked.rb +10 -0
  25. data/lib/kmdb/jobs/parse_file.rb +109 -0
  26. data/lib/kmdb/jobs/record_batch.rb +65 -0
  27. data/lib/kmdb/jobs/redo_unaliasing.rb +31 -0
  28. data/lib/kmdb/jobs/unalias_user.rb +32 -0
  29. data/lib/kmdb/migrations/01_kmdb_initial.rb +78 -0
  30. data/lib/kmdb/migrations/02_kmdb_partitions.rb +28 -0
  31. data/lib/kmdb/migrations/03_kmdb_blacklist.rb +20 -0
  32. data/lib/kmdb/models/alias.rb +36 -0
  33. data/lib/kmdb/models/blacklisted_property.rb +20 -0
  34. data/lib/kmdb/models/custom_record.rb +53 -0
  35. data/lib/kmdb/models/dumpfile.rb +33 -0
  36. data/lib/kmdb/models/event.rb +56 -0
  37. data/lib/kmdb/models/event_batch.rb +72 -0
  38. data/lib/kmdb/models/global_uid.rb +42 -0
  39. data/lib/kmdb/models/ignored_user.rb +20 -0
  40. data/lib/kmdb/models/json_file.rb +56 -0
  41. data/lib/kmdb/models/key.rb +28 -0
  42. data/lib/kmdb/models/property.rb +44 -0
  43. data/lib/kmdb/models/s3_object.rb +54 -0
  44. data/lib/kmdb/models/user.rb +53 -0
  45. data/lib/kmdb/models/whitelisted_event.rb +20 -0
  46. data/lib/kmdb/parser.rb +4 -4
  47. data/lib/kmdb/redis.rb +17 -0
  48. data/lib/kmdb/resque.rb +38 -0
  49. data/lib/kmdb/s3_bucket.rb +33 -0
  50. data/lib/kmdb/services/partitioner.rb +65 -0
  51. data/lib/kmdb/version.rb +1 -1
  52. data/lib/kmdb.rb +31 -6
  53. metadata +236 -186
  54. data/README.markdown +0 -91
  55. data/bin/km_db_import +0 -36
  56. data/lib/kmdb/custom_record.rb +0 -54
  57. data/lib/kmdb/dumpfile.rb +0 -23
  58. data/lib/kmdb/event.rb +0 -39
  59. data/lib/kmdb/has_properties.rb +0 -33
  60. data/lib/kmdb/key.rb +0 -56
  61. data/lib/kmdb/migration.rb +0 -63
  62. data/lib/kmdb/parallel_parser.rb +0 -85
  63. data/lib/kmdb/property.rb +0 -33
  64. data/lib/kmdb/user.rb +0 -83
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a1d53c84fa85914d141e481f75759df0a9cdaad0
4
+ data.tar.gz: 58605a95792bfd91bc5b3877b1460836cc883086
5
+ SHA512:
6
+ metadata.gz: 5dfb47a0f99adb83c288792df7fbe08b37474048844e966f604180da8c5885715e6aa7a04c866b50e257df5f3db322546697bfba9b11400342a2a33690a6a461
7
+ data.tar.gz: 1b14b242a34d432516402b47e7ac1e7db70b82afe1f7761aaa23a43915226f99f47cc7d21a06d2a9ad417a0fff843fd44c784fb20d64d87d6aff25538728419e
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ data/
2
+ tmp/
3
+ pkg/
4
+ tags
5
+ .DS_Store
6
+ .bundle
7
+
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.2
data/Gemfile CHANGED
@@ -1,6 +1,4 @@
1
- source :gemcutter
1
+ source ENV.fetch('GEM_SOURCE', 'https://rubygems.org')
2
2
 
3
- gem 'progressbar'
4
-
5
- # Specify your gem's dependencies in km.gemspec
6
3
  gemspec
4
+
data/Gemfile.lock CHANGED
@@ -1,25 +1,175 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- km-db (0.2.1)
5
- activerecord (~> 2.3.12)
4
+ km-db (0.3.2)
5
+ activerecord (~> 4.1)
6
6
  andand
7
- parallel
7
+ fog
8
+ foreman
9
+ mysql2
10
+ oj
8
11
  progressbar
9
- yajl-ruby
12
+ resque
13
+ resque-lock
10
14
 
11
15
  GEM
12
- remote: http://rubygems.org/
16
+ remote: https://rubygems.org/
13
17
  specs:
14
- activerecord (2.3.18)
15
- activesupport (= 2.3.18)
16
- activesupport (2.3.18)
18
+ CFPropertyList (2.3.1)
19
+ activemodel (4.2.2)
20
+ activesupport (= 4.2.2)
21
+ builder (~> 3.1)
22
+ activerecord (4.2.2)
23
+ activemodel (= 4.2.2)
24
+ activesupport (= 4.2.2)
25
+ arel (~> 6.0)
26
+ activesupport (4.2.2)
27
+ i18n (~> 0.7)
28
+ json (~> 1.7, >= 1.7.7)
29
+ minitest (~> 5.1)
30
+ thread_safe (~> 0.3, >= 0.3.4)
31
+ tzinfo (~> 1.1)
17
32
  andand (1.3.3)
33
+ arel (6.0.0)
34
+ builder (3.2.2)
35
+ coderay (1.1.0)
18
36
  diff-lcs (1.1.3)
19
- json (1.7.7)
20
- parallel (0.6.3)
21
- progressbar (0.20.0)
22
- rake (10.0.3)
37
+ excon (0.45.3)
38
+ fission (0.5.0)
39
+ CFPropertyList (~> 2.2)
40
+ fog (1.31.0)
41
+ fog-atmos
42
+ fog-aws (~> 0.0)
43
+ fog-brightbox (~> 0.4)
44
+ fog-core (~> 1.30)
45
+ fog-ecloud
46
+ fog-google (>= 0.0.2)
47
+ fog-json
48
+ fog-local
49
+ fog-powerdns (>= 0.1.1)
50
+ fog-profitbricks
51
+ fog-radosgw (>= 0.0.2)
52
+ fog-riakcs
53
+ fog-sakuracloud (>= 0.0.4)
54
+ fog-serverlove
55
+ fog-softlayer
56
+ fog-storm_on_demand
57
+ fog-terremark
58
+ fog-vmfusion
59
+ fog-voxel
60
+ fog-xml (~> 0.1.1)
61
+ ipaddress (~> 0.5)
62
+ nokogiri (~> 1.5, >= 1.5.11)
63
+ fog-atmos (0.1.0)
64
+ fog-core
65
+ fog-xml
66
+ fog-aws (0.5.0)
67
+ fog-core (~> 1.27)
68
+ fog-json (~> 1.0)
69
+ fog-xml (~> 0.1)
70
+ ipaddress (~> 0.8)
71
+ fog-brightbox (0.7.1)
72
+ fog-core (~> 1.22)
73
+ fog-json
74
+ inflecto (~> 0.0.2)
75
+ fog-core (1.31.1)
76
+ builder
77
+ excon (~> 0.45)
78
+ formatador (~> 0.2)
79
+ mime-types
80
+ net-scp (~> 1.1)
81
+ net-ssh (>= 2.1.3)
82
+ fog-ecloud (0.1.3)
83
+ fog-core
84
+ fog-xml
85
+ fog-google (0.0.5)
86
+ fog-core
87
+ fog-json
88
+ fog-xml
89
+ fog-json (1.0.2)
90
+ fog-core (~> 1.0)
91
+ multi_json (~> 1.10)
92
+ fog-local (0.2.1)
93
+ fog-core (~> 1.27)
94
+ fog-powerdns (0.1.1)
95
+ fog-core (~> 1.27)
96
+ fog-json (~> 1.0)
97
+ fog-xml (~> 0.1)
98
+ fog-profitbricks (0.0.3)
99
+ fog-core
100
+ fog-xml
101
+ nokogiri
102
+ fog-radosgw (0.0.4)
103
+ fog-core (>= 1.21.0)
104
+ fog-json
105
+ fog-xml (>= 0.0.1)
106
+ fog-riakcs (0.1.0)
107
+ fog-core
108
+ fog-json
109
+ fog-xml
110
+ fog-sakuracloud (1.0.1)
111
+ fog-core
112
+ fog-json
113
+ fog-serverlove (0.1.2)
114
+ fog-core
115
+ fog-json
116
+ fog-softlayer (0.4.6)
117
+ fog-core
118
+ fog-json
119
+ fog-storm_on_demand (0.1.1)
120
+ fog-core
121
+ fog-json
122
+ fog-terremark (0.1.0)
123
+ fog-core
124
+ fog-xml
125
+ fog-vmfusion (0.1.0)
126
+ fission
127
+ fog-core
128
+ fog-voxel (0.1.0)
129
+ fog-core
130
+ fog-xml
131
+ fog-xml (0.1.2)
132
+ fog-core
133
+ nokogiri (~> 1.5, >= 1.5.11)
134
+ foreman (0.78.0)
135
+ thor (~> 0.19.1)
136
+ formatador (0.2.5)
137
+ i18n (0.7.0)
138
+ inflecto (0.0.2)
139
+ ipaddress (0.8.0)
140
+ json (1.8.3)
141
+ method_source (0.8.2)
142
+ mime-types (2.6.1)
143
+ mini_portile (0.6.2)
144
+ minitest (5.7.0)
145
+ mono_logger (1.1.0)
146
+ multi_json (1.11.1)
147
+ mysql2 (0.3.18)
148
+ net-scp (1.2.1)
149
+ net-ssh (>= 2.6.5)
150
+ net-ssh (2.9.2)
151
+ nokogiri (1.6.6.2)
152
+ mini_portile (~> 0.6.0)
153
+ oj (2.12.9)
154
+ progressbar (0.21.0)
155
+ pry (0.10.0)
156
+ coderay (~> 1.1.0)
157
+ method_source (~> 0.8.1)
158
+ slop (~> 3.4)
159
+ rack (1.6.2)
160
+ rack-protection (1.5.3)
161
+ rack
162
+ rake (10.3.2)
163
+ redis (3.2.1)
164
+ redis-namespace (1.5.2)
165
+ redis (~> 3.0, >= 3.0.4)
166
+ resque (1.25.2)
167
+ mono_logger (~> 1.0)
168
+ multi_json (~> 1.0)
169
+ redis-namespace (~> 1.3)
170
+ sinatra (>= 0.9.2)
171
+ vegas (~> 0.1.2)
172
+ resque-lock (1.1.0)
23
173
  rspec (2.4.0)
24
174
  rspec-core (~> 2.4.0)
25
175
  rspec-expectations (~> 2.4.0)
@@ -28,19 +178,28 @@ GEM
28
178
  rspec-expectations (2.4.0)
29
179
  diff-lcs (~> 1.1.2)
30
180
  rspec-mocks (2.4.0)
31
- sqlite3 (1.3.7)
32
- sqlite3-ruby (1.3.3)
33
- sqlite3 (>= 1.3.3)
34
- yajl-ruby (1.1.0)
181
+ sinatra (1.4.6)
182
+ rack (~> 1.4)
183
+ rack-protection (~> 1.4)
184
+ tilt (>= 1.3, < 3)
185
+ slop (3.5.0)
186
+ thor (0.19.1)
187
+ thread_safe (0.3.5)
188
+ tilt (2.0.1)
189
+ tzinfo (1.2.2)
190
+ thread_safe (~> 0.1)
191
+ vegas (0.1.11)
192
+ rack (>= 1.0.0)
35
193
 
36
194
  PLATFORMS
37
195
  ruby
38
196
 
39
197
  DEPENDENCIES
40
- bundler (>= 1.0.0)
41
- json
198
+ bundler
42
199
  km-db!
43
- progressbar
200
+ pry
44
201
  rake
45
202
  rspec (~> 2.4.0)
46
- sqlite3-ruby
203
+
204
+ BUNDLED WITH
205
+ 1.10.4
data/Procfile ADDED
@@ -0,0 +1,2 @@
1
+ web: bundle exec rackup
2
+ workers: foreman start -f Procfile.work -c worker=$RESQUE_WORKERS
data/Procfile.work ADDED
@@ -0,0 +1 @@
1
+ resque: bundle exec bin/kmdb-work
data/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # KMDB
2
+
3
+ The `km-db` gem should be useful to KissMetrics (KM) users.
4
+ Its aim is to efficiently process data obtained with KM's "Data Export" feature.
5
+
6
+ Its main feature is to import dumps directly from S3 into a SQL database,
7
+ optimized for typical queries (in particular, partitioned along the time
8
+ dimension).
9
+
10
+ Once imported, you can run complex queries against your visit history, for
11
+ instance run multivariate analysis.
12
+
13
+ Beware though, KM data can be huge, and processing it is taxing!
14
+
15
+
16
+ ## Installing
17
+
18
+ If you want to run "just" KM-DB, you might want to just use [the
19
+ app](https://github.com/HouseTrip/km-db-app).
20
+
21
+ Otherwise, add this to your Gemfile if you're using Bundler:
22
+
23
+ gem 'km-db'
24
+
25
+ ### Configuration
26
+
27
+ KMDB is configured through environment variables. We recommend storing these
28
+ settings in a `.env` file if running locally, and using [foreman]() to start
29
+ KMDB commands with the environment set.
30
+
31
+
32
+ ### Preparing your database
33
+
34
+ KMDB requires a MySQL database (to store events, properties, etc) and a Redis
35
+ store running (to store batch jobs and cache data).
36
+
37
+ Set the following:
38
+
39
+ - `DATABASE_URL` (required), e.g. `mysql2://km_db_test@localhost/km_db_test`
40
+ - `KMDB_REDIS_URL` [localhost], e.g. `redis://localhost/14`
41
+
42
+ Then run:
43
+
44
+ $ kmdb-flush
45
+
46
+ to prepare your database.
47
+
48
+
49
+ ### Optimizing your database
50
+
51
+ If your dataset is large (over 1 million events), KMDB can [partition]() your
52
+ database, i.e. transparently split large tables into smaller buckets of
53
+ continuous time periods.
54
+
55
+ Set the following:
56
+
57
+ - `KMDB_MIN_DATE` (required), e.g. '2014-01-01'
58
+ - `KMDB_MAX_DATE` (required), e.g. '2016-01-01'
59
+ - `KMDB_DAYS_PER_PARTITION` (required), e.g. '7'
60
+
61
+ Then run:
62
+
63
+ $ kmdb-partition
64
+
65
+ Notes:
66
+
67
+ - MySQL only supports up to 1024 partitions.
68
+ - You should aim for fewer than 1 million events per partition for performance.
69
+ - You should run this _before_ importing data, but it's possible to re-run it.
70
+ The `MIN_DATE` will be ignored, and partitions will be added up to the new
71
+ `MAX_DATE` (if larger).
72
+
73
+
74
+ ## Importing data
75
+
76
+ KMDB will fetch JSON files from the S3 bucket where you instructed KissMetrics
77
+ to back up your data, parse them, and store information in the database.
78
+
79
+ It does so using [resque]() for high parallelism of the import process; in our
80
+ experience, it's perfectly possible to import 100GB of data in a few hours.
81
+
82
+ Set the following:
83
+
84
+ - `RESQUE_WORKERS` (1), number of worker nodes.
85
+ - `KMDB_MIN_REVISION` (optional, default 1), first KissMetrics revision file you want to import.
86
+ - `KMDB_REVISION_LOOKAHEAD` (10), how many revision files to check after the last known one
87
+ - `KMDB_BATCH_SIZE` (100), how many events to process per batch (advisory, may
88
+ be higher as an entire second's worth of events will always be processed in one
89
+ batch to preserve ordering).
90
+ - `AWS_BUCKET` (required), the name of the S3 bucket where the data is stored.
91
+ - `AWS_ACCESS_KEY_ID` (required).
92
+ - `AWS_SECRET_ACCESS_KEY` (required).
93
+
94
+
95
+ ### Ignoring some users
96
+
97
+ You may want to ignore all events and properties for certain users, for instance
98
+ the administrative users of your site (or employees).
99
+
100
+ Simply add their identities to the `ignored_users` table before import.
101
+
102
+
103
+ ### Whitelisting events
104
+
105
+ It's typical to have some noisy and/or shorter-lived events sent to KissMetrics,
106
+ e.g. for testing purposes or for temporary monitoring.
107
+
108
+ Should you only want to import certain events, add their names to the
109
+ `whitelisted_events` table before starting import.
110
+
111
+ If the table is left empty, all events will be imported.
112
+
113
+
114
+ ### Dealiasing users
115
+
116
+ When KissMetrics finds a way to tie two user identities as being a single actual
117
+ user, it stores an "aliasing" event.
118
+ KMDB de-aliases users automatically during import, and will store all events and
119
+ properties against a single user identity (one that's numeric if any, otherwise
120
+ the lexicographically lowest).
121
+
122
+
123
+ ## Using imported data
124
+
125
+ ### Using SQL directly
126
+
127
+ KMDB tries to stay close to the KissMetrics data, leaving you to interpret it.
128
+ As such, the main tables are unsurprisingly `events` and `properties`.
129
+
130
+ Here's a summary of the data model:
131
+
132
+ `events` has one row for each imported event:
133
+
134
+ | **events** |
135
+ |------------|
136
+ | id |
137
+ | t | the event timestamp |
138
+ | n | reference to the event name |
139
+ | user_id | reference to the user |
140
+
141
+ `properties` has one row for each property ever set on events or users
142
+
143
+ | **properties** |
144
+ |----------------|
145
+ | id |
146
+ | t | timestamp at which the property was set |
147
+ | key | reference to the property name |
148
+ | value | value (string) |
149
+ | user_id | reference to the user |
150
+ | event_id | reference to the event (may be NULL) |
151
+
152
+ `events.n` and `properties.key` reference the `id` column of the `keys` table;
153
+ this is done for performance reasons (event and property names are only stored
154
+ once):
155
+
156
+ | **keys** |
157
+ |----------|
158
+ | id |
159
+ | string |
160
+
161
+ KMDB also keeps the original user identities around in `users`, although you'll
162
+ probably never need them:
163
+
164
+ | **users** |
165
+ |-----------|
166
+ | id |
167
+ | name | the identity given by KissMetrics |
168
+
169
+ as well as all aliasing events:
170
+
171
+ | **aliases** |
172
+ |-------------|
173
+ | id |
174
+ | name1 |
175
+ | name2 |
176
+
177
+
178
+ ### Using ActiveRecord
179
+
180
+ The `KMDB` module exposes four `ActiveRecord` classes:
181
+ `Event`, `Property`, `User` are the main domain objects.
182
+
183
+ `Key` is used to intern strings (event and property names) for performance.
184
+
185
+ Please consult the source of these models for details.
186
+
data/Rakefile CHANGED
@@ -3,3 +3,4 @@ require 'rspec/core/rake_task'
3
3
  Bundler::GemHelper.install_tasks
4
4
 
5
5
  RSpec::Core::RakeTask.new(:spec)
6
+
data/bin/kmdb-flush ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/redis'
4
+
5
+ KMDB.connect.migrate
6
+
7
+ KMDB.transaction do |c|
8
+ %w(aliases dumpfiles events properties users).each do |table|
9
+ c.execute "TRUNCATE TABLE #{table}"
10
+ end
11
+ end
12
+
13
+ KMDB::Redis.connection.flushdb
data/bin/kmdb-import ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+
4
+ Import KM events from the raw dumps.
5
+
6
+ =end
7
+ require 'kmdb'
8
+ require 'kmdb/resque'
9
+ require 'kmdb/jobs/find_files'
10
+ require 'kmdb/jobs/list_files'
11
+
12
+ KMDB::Resque.enqueue(KMDB::Jobs::FindFiles)
13
+
data/bin/kmdb-partition ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/models/event'
4
+ require 'kmdb/models/alias'
5
+ require 'kmdb/models/property'
6
+ require 'kmdb/services/partitioner'
7
+
8
+ KMDB.connect.migrate
9
+
10
+ module KMDB
11
+ [Event, Property, Alias].each do |model|
12
+ Services::Partitioner.new(model: model).run
13
+ end
14
+ end
15
+
data/bin/kmdb-pool ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ exec(
4
+ 'foreman', 'start',
5
+ '--procfile=%s' % File.expand_path('../../Procfile.work', __FILE__),
6
+ '--formation=resque=%s' % ENV.fetch('RESQUE_WORKERS', 1)
7
+ )
8
+
data/bin/kmdb-realias ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+
4
+ Enqueue a job that redoes user unaliasing (KMDB::Jobs::RedoUnaliasing).
5
+
6
+ =end
7
+ require 'kmdb'
8
+ require 'kmdb/resque'
9
+ require 'kmdb/jobs/redo_unaliasing'
10
+
11
+ KMDB::Resque.enqueue(KMDB::Jobs::RedoUnaliasing, Date.today.to_s)
12
+
data/bin/kmdb-ui ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ exec('rackup',
4
+ '--port', ENV.fetch('PORT'),
5
+ File.expand_path('../../config.ru', __FILE__)
6
+ )
data/bin/kmdb-work ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/resque'
4
+
5
+ # suppress a silly warning
6
+ require 'i18n'
7
+ I18n.enforce_available_locales = false
8
+
9
+ # load all jobs
10
+ Dir[File.expand_path('../../lib/kmdb/jobs/*.rb', __FILE__)].each do |job|
11
+ require job
12
+ end
13
+
14
+
15
+ KMDB.connect
16
+ KMDB::Resque.work
17
+