km-db 0.2.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +2 -4
  5. data/Gemfile.lock +179 -20
  6. data/Procfile +2 -0
  7. data/Procfile.work +1 -0
  8. data/README.md +186 -0
  9. data/Rakefile +1 -0
  10. data/bin/kmdb-flush +13 -0
  11. data/bin/kmdb-import +13 -0
  12. data/bin/kmdb-partition +15 -0
  13. data/bin/kmdb-pool +8 -0
  14. data/bin/kmdb-realias +12 -0
  15. data/bin/kmdb-ui +6 -0
  16. data/bin/kmdb-work +17 -0
  17. data/config/amazon-rds-ca-cert.pem +260 -0
  18. data/config.ru +8 -0
  19. data/km-db.gemspec +17 -17
  20. data/lib/kmdb/{belongs_to_user.rb → concerns/belongs_to_user.rb} +3 -3
  21. data/lib/kmdb/concerns/has_properties.rb +35 -0
  22. data/lib/kmdb/jobs/find_files.rb +32 -0
  23. data/lib/kmdb/jobs/list_files.rb +37 -0
  24. data/lib/kmdb/jobs/locked.rb +10 -0
  25. data/lib/kmdb/jobs/parse_file.rb +109 -0
  26. data/lib/kmdb/jobs/record_batch.rb +65 -0
  27. data/lib/kmdb/jobs/redo_unaliasing.rb +31 -0
  28. data/lib/kmdb/jobs/unalias_user.rb +32 -0
  29. data/lib/kmdb/migrations/01_kmdb_initial.rb +78 -0
  30. data/lib/kmdb/migrations/02_kmdb_partitions.rb +28 -0
  31. data/lib/kmdb/migrations/03_kmdb_blacklist.rb +20 -0
  32. data/lib/kmdb/models/alias.rb +36 -0
  33. data/lib/kmdb/models/blacklisted_property.rb +20 -0
  34. data/lib/kmdb/models/custom_record.rb +53 -0
  35. data/lib/kmdb/models/dumpfile.rb +33 -0
  36. data/lib/kmdb/models/event.rb +56 -0
  37. data/lib/kmdb/models/event_batch.rb +72 -0
  38. data/lib/kmdb/models/global_uid.rb +42 -0
  39. data/lib/kmdb/models/ignored_user.rb +20 -0
  40. data/lib/kmdb/models/json_file.rb +56 -0
  41. data/lib/kmdb/models/key.rb +28 -0
  42. data/lib/kmdb/models/property.rb +44 -0
  43. data/lib/kmdb/models/s3_object.rb +54 -0
  44. data/lib/kmdb/models/user.rb +53 -0
  45. data/lib/kmdb/models/whitelisted_event.rb +20 -0
  46. data/lib/kmdb/parser.rb +4 -4
  47. data/lib/kmdb/redis.rb +17 -0
  48. data/lib/kmdb/resque.rb +38 -0
  49. data/lib/kmdb/s3_bucket.rb +33 -0
  50. data/lib/kmdb/services/partitioner.rb +65 -0
  51. data/lib/kmdb/version.rb +1 -1
  52. data/lib/kmdb.rb +31 -6
  53. metadata +236 -186
  54. data/README.markdown +0 -91
  55. data/bin/km_db_import +0 -36
  56. data/lib/kmdb/custom_record.rb +0 -54
  57. data/lib/kmdb/dumpfile.rb +0 -23
  58. data/lib/kmdb/event.rb +0 -39
  59. data/lib/kmdb/has_properties.rb +0 -33
  60. data/lib/kmdb/key.rb +0 -56
  61. data/lib/kmdb/migration.rb +0 -63
  62. data/lib/kmdb/parallel_parser.rb +0 -85
  63. data/lib/kmdb/property.rb +0 -33
  64. data/lib/kmdb/user.rb +0 -83
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a1d53c84fa85914d141e481f75759df0a9cdaad0
4
+ data.tar.gz: 58605a95792bfd91bc5b3877b1460836cc883086
5
+ SHA512:
6
+ metadata.gz: 5dfb47a0f99adb83c288792df7fbe08b37474048844e966f604180da8c5885715e6aa7a04c866b50e257df5f3db322546697bfba9b11400342a2a33690a6a461
7
+ data.tar.gz: 1b14b242a34d432516402b47e7ac1e7db70b82afe1f7761aaa23a43915226f99f47cc7d21a06d2a9ad417a0fff843fd44c784fb20d64d87d6aff25538728419e
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ data/
2
+ tmp/
3
+ pkg/
4
+ tags
5
+ .DS_Store
6
+ .bundle
7
+
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1.2
data/Gemfile CHANGED
@@ -1,6 +1,4 @@
1
- source :gemcutter
1
+ source ENV.fetch('GEM_SOURCE', 'https://rubygems.org')
2
2
 
3
- gem 'progressbar'
4
-
5
- # Specify your gem's dependencies in km.gemspec
6
3
  gemspec
4
+
data/Gemfile.lock CHANGED
@@ -1,25 +1,175 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- km-db (0.2.1)
5
- activerecord (~> 2.3.12)
4
+ km-db (0.3.2)
5
+ activerecord (~> 4.1)
6
6
  andand
7
- parallel
7
+ fog
8
+ foreman
9
+ mysql2
10
+ oj
8
11
  progressbar
9
- yajl-ruby
12
+ resque
13
+ resque-lock
10
14
 
11
15
  GEM
12
- remote: http://rubygems.org/
16
+ remote: https://rubygems.org/
13
17
  specs:
14
- activerecord (2.3.18)
15
- activesupport (= 2.3.18)
16
- activesupport (2.3.18)
18
+ CFPropertyList (2.3.1)
19
+ activemodel (4.2.2)
20
+ activesupport (= 4.2.2)
21
+ builder (~> 3.1)
22
+ activerecord (4.2.2)
23
+ activemodel (= 4.2.2)
24
+ activesupport (= 4.2.2)
25
+ arel (~> 6.0)
26
+ activesupport (4.2.2)
27
+ i18n (~> 0.7)
28
+ json (~> 1.7, >= 1.7.7)
29
+ minitest (~> 5.1)
30
+ thread_safe (~> 0.3, >= 0.3.4)
31
+ tzinfo (~> 1.1)
17
32
  andand (1.3.3)
33
+ arel (6.0.0)
34
+ builder (3.2.2)
35
+ coderay (1.1.0)
18
36
  diff-lcs (1.1.3)
19
- json (1.7.7)
20
- parallel (0.6.3)
21
- progressbar (0.20.0)
22
- rake (10.0.3)
37
+ excon (0.45.3)
38
+ fission (0.5.0)
39
+ CFPropertyList (~> 2.2)
40
+ fog (1.31.0)
41
+ fog-atmos
42
+ fog-aws (~> 0.0)
43
+ fog-brightbox (~> 0.4)
44
+ fog-core (~> 1.30)
45
+ fog-ecloud
46
+ fog-google (>= 0.0.2)
47
+ fog-json
48
+ fog-local
49
+ fog-powerdns (>= 0.1.1)
50
+ fog-profitbricks
51
+ fog-radosgw (>= 0.0.2)
52
+ fog-riakcs
53
+ fog-sakuracloud (>= 0.0.4)
54
+ fog-serverlove
55
+ fog-softlayer
56
+ fog-storm_on_demand
57
+ fog-terremark
58
+ fog-vmfusion
59
+ fog-voxel
60
+ fog-xml (~> 0.1.1)
61
+ ipaddress (~> 0.5)
62
+ nokogiri (~> 1.5, >= 1.5.11)
63
+ fog-atmos (0.1.0)
64
+ fog-core
65
+ fog-xml
66
+ fog-aws (0.5.0)
67
+ fog-core (~> 1.27)
68
+ fog-json (~> 1.0)
69
+ fog-xml (~> 0.1)
70
+ ipaddress (~> 0.8)
71
+ fog-brightbox (0.7.1)
72
+ fog-core (~> 1.22)
73
+ fog-json
74
+ inflecto (~> 0.0.2)
75
+ fog-core (1.31.1)
76
+ builder
77
+ excon (~> 0.45)
78
+ formatador (~> 0.2)
79
+ mime-types
80
+ net-scp (~> 1.1)
81
+ net-ssh (>= 2.1.3)
82
+ fog-ecloud (0.1.3)
83
+ fog-core
84
+ fog-xml
85
+ fog-google (0.0.5)
86
+ fog-core
87
+ fog-json
88
+ fog-xml
89
+ fog-json (1.0.2)
90
+ fog-core (~> 1.0)
91
+ multi_json (~> 1.10)
92
+ fog-local (0.2.1)
93
+ fog-core (~> 1.27)
94
+ fog-powerdns (0.1.1)
95
+ fog-core (~> 1.27)
96
+ fog-json (~> 1.0)
97
+ fog-xml (~> 0.1)
98
+ fog-profitbricks (0.0.3)
99
+ fog-core
100
+ fog-xml
101
+ nokogiri
102
+ fog-radosgw (0.0.4)
103
+ fog-core (>= 1.21.0)
104
+ fog-json
105
+ fog-xml (>= 0.0.1)
106
+ fog-riakcs (0.1.0)
107
+ fog-core
108
+ fog-json
109
+ fog-xml
110
+ fog-sakuracloud (1.0.1)
111
+ fog-core
112
+ fog-json
113
+ fog-serverlove (0.1.2)
114
+ fog-core
115
+ fog-json
116
+ fog-softlayer (0.4.6)
117
+ fog-core
118
+ fog-json
119
+ fog-storm_on_demand (0.1.1)
120
+ fog-core
121
+ fog-json
122
+ fog-terremark (0.1.0)
123
+ fog-core
124
+ fog-xml
125
+ fog-vmfusion (0.1.0)
126
+ fission
127
+ fog-core
128
+ fog-voxel (0.1.0)
129
+ fog-core
130
+ fog-xml
131
+ fog-xml (0.1.2)
132
+ fog-core
133
+ nokogiri (~> 1.5, >= 1.5.11)
134
+ foreman (0.78.0)
135
+ thor (~> 0.19.1)
136
+ formatador (0.2.5)
137
+ i18n (0.7.0)
138
+ inflecto (0.0.2)
139
+ ipaddress (0.8.0)
140
+ json (1.8.3)
141
+ method_source (0.8.2)
142
+ mime-types (2.6.1)
143
+ mini_portile (0.6.2)
144
+ minitest (5.7.0)
145
+ mono_logger (1.1.0)
146
+ multi_json (1.11.1)
147
+ mysql2 (0.3.18)
148
+ net-scp (1.2.1)
149
+ net-ssh (>= 2.6.5)
150
+ net-ssh (2.9.2)
151
+ nokogiri (1.6.6.2)
152
+ mini_portile (~> 0.6.0)
153
+ oj (2.12.9)
154
+ progressbar (0.21.0)
155
+ pry (0.10.0)
156
+ coderay (~> 1.1.0)
157
+ method_source (~> 0.8.1)
158
+ slop (~> 3.4)
159
+ rack (1.6.2)
160
+ rack-protection (1.5.3)
161
+ rack
162
+ rake (10.3.2)
163
+ redis (3.2.1)
164
+ redis-namespace (1.5.2)
165
+ redis (~> 3.0, >= 3.0.4)
166
+ resque (1.25.2)
167
+ mono_logger (~> 1.0)
168
+ multi_json (~> 1.0)
169
+ redis-namespace (~> 1.3)
170
+ sinatra (>= 0.9.2)
171
+ vegas (~> 0.1.2)
172
+ resque-lock (1.1.0)
23
173
  rspec (2.4.0)
24
174
  rspec-core (~> 2.4.0)
25
175
  rspec-expectations (~> 2.4.0)
@@ -28,19 +178,28 @@ GEM
28
178
  rspec-expectations (2.4.0)
29
179
  diff-lcs (~> 1.1.2)
30
180
  rspec-mocks (2.4.0)
31
- sqlite3 (1.3.7)
32
- sqlite3-ruby (1.3.3)
33
- sqlite3 (>= 1.3.3)
34
- yajl-ruby (1.1.0)
181
+ sinatra (1.4.6)
182
+ rack (~> 1.4)
183
+ rack-protection (~> 1.4)
184
+ tilt (>= 1.3, < 3)
185
+ slop (3.5.0)
186
+ thor (0.19.1)
187
+ thread_safe (0.3.5)
188
+ tilt (2.0.1)
189
+ tzinfo (1.2.2)
190
+ thread_safe (~> 0.1)
191
+ vegas (0.1.11)
192
+ rack (>= 1.0.0)
35
193
 
36
194
  PLATFORMS
37
195
  ruby
38
196
 
39
197
  DEPENDENCIES
40
- bundler (>= 1.0.0)
41
- json
198
+ bundler
42
199
  km-db!
43
- progressbar
200
+ pry
44
201
  rake
45
202
  rspec (~> 2.4.0)
46
- sqlite3-ruby
203
+
204
+ BUNDLED WITH
205
+ 1.10.4
data/Procfile ADDED
@@ -0,0 +1,2 @@
1
+ web: bundle exec rackup
2
+ workers: foreman start -f Procfile.work -c worker=$RESQUE_WORKERS
data/Procfile.work ADDED
@@ -0,0 +1 @@
1
+ resque: bundle exec bin/kmdb-work
data/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # KMDB
2
+
3
+ The `km-db` gem should be useful to KissMetrics (KM) users.
4
+ Its aim is to efficiently process data obtained with KM's "Data Export" feature.
5
+
6
+ Its main feature is to import dumps directly from S3 into a SQL database,
7
+ optimized for typical queries (in particular, partitioned along the time
8
+ dimension).
9
+
10
+ Once imported, you can run complex queries against your visit history, for
11
+ instance run multivariate analysis.
12
+
13
+ Beware though, KM data can be huge, and processing it is taxing!
14
+
15
+
16
+ ## Installing
17
+
18
+ If you want to run "just" KM-DB, you might want to just use [the
19
+ app](https://github.com/HouseTrip/km-db-app).
20
+
21
+ Otherwise, add this to your Gemfile if you're using Bundler:
22
+
23
+ gem 'km-db'
24
+
25
+ ### Configuration
26
+
27
+ KMDB is configured through environment variables. We recommend storing this
28
+ settings in a `.env` file if running locally, and using [foreman]() to start
29
+ KMDB commands with the environment set.
30
+
31
+
32
+ ### Preparing your database
33
+
34
+ KMDB requires a MySQL database (to store events, properties, etc) and a Redis
35
+ store running (to store batch jobs and cache data).
36
+
37
+ Set the following:
38
+
39
+ - `DATABASE_URL` (required), e.g. `mysql2://km_db_test@localhost/km_db_test`
40
+ - `KMDB_REDIS_URL` [localhost], e.g. `redis://localhost/14`
41
+
42
+ Then run:
43
+
44
+ $ kmdb-flush
45
+
46
+ to prepare your database.
47
+
48
+
49
+ ### Optimizing your database
50
+
51
+ If your dataset is large (over 1 million events), KMDB can [partition]() your
52
+ database, i.e. transparently split large tables into smaller buckets of
53
+ continuous time periods.
54
+
55
+ Set the following:
56
+
57
+ - `KMDB_MIN_DATE` (required), e.g. '2014-01-01'
58
+ - `KMDB_MAX_DATE` (required), e.g. '2016-01-01'
59
+ - `KMDB_DAYS_PER_PARTITION` (required), e.g. '7'
60
+
61
+ Then run:
62
+
63
+ $ kmdb-partition
64
+
65
+ Notes:
66
+
67
+ - MySQL only supports up to 1024 partitions.
68
+ - You shoud aim for less than 1 million events per partitions for performance.
69
+ - You should run this _before_ importing data, but it's possible to re-run it.
70
+ The `MIN_DATE` will be ignored, and partitions will be added up to the new
71
+ `MAX_DATE` (if larger).
72
+
73
+
74
+ ## Importing data
75
+
76
+ KMDB will fetch JSON files form the S3 bucket where you instructed KissMetrics
77
+ to back up your data, parse them, and store information in the database.
78
+
79
+ It does so using [resque]() for high parallelism of the import process; in our
80
+ experience, it's perfectly possible to import 100GB of data in a few hours.
81
+
82
+ Set the following:
83
+
84
+ - `RESQUE_WORKERS` (1), number of worker nodes.
85
+ - `KMDB_MIN_REVISION` (optional, default 1), first KissMetrics revision file you want to import.
86
+ - `KMDB_REVISION_LOOKAHEAD` (10), how many revision files to check after the last known one
87
+ - `KMDB_BATCH_SIZE` (100), how many events to process per batch (advisory, may
88
+ be higher as an entire second's worth of events will always be processed in one
89
+ batch to preserve ordering).
90
+ - `AWS_BUCKET` (required), the name of the S3 bucket where the data is stored.
91
+ - `AWS_ACCESS_KEY_ID` (required).
92
+ - `AWS_SECRET_ACCESS_KEY` (required).
93
+
94
+
95
+ ### Ignoring some users
96
+
97
+ You may want to ignore all events and properties for certain users, for instance
98
+ the administrative users of your site (or employees).
99
+
100
+ Simply add their identities to the `ignored_users` table before import.
101
+
102
+
103
+ ### Whitelisting events
104
+
105
+ It's typical to have some noisy and/or shorter-lived events sent to KissMetrics,
106
+ e.g. for testing purposes or for temporary monitoring.
107
+
108
+ Should you only want to import certain events, add their names to the
109
+ `whitelisted_events` table before starting import.
110
+
111
+ If the table is left empty, all events will be imported.
112
+
113
+
114
+ ### Dealiasing users
115
+
116
+ When KissMetrics finds a way to tie two user identities as being a single actual
117
+ user, it stores an "aliasing" event.
118
+ KMDB de-aliases users automatically during import, and will store all events and
119
+ properties against a single user identity (one that's numeric if any, otherwise
120
+ the lexicographically lowest).
121
+
122
+
123
+ ## Using imported data
124
+
125
+ ### Using SQL directly
126
+
127
+ KMDB tries to stay close to the KissMetrics data, leaving you to interpret it.
128
+ As such, the main tables are unsurprisingly `events` and `properties`.
129
+
130
+ Here's a summary of the data model:
131
+
132
+ `events` has one row for each imported event:
133
+
134
+ | **events** |
135
+ |------------|
136
+ | id |
137
+ | t | the event timestamp |
138
+ | n | reference to the event name |
139
+ | user_id | reference to the user |
140
+
141
+ `properties` has one row for each property ever set on events or users
142
+
143
+ | **properties** |
144
+ |----------------|
145
+ | id |
146
+ | t | timestamp at which the property was set |
147
+ | key | reference to the property name |
148
+ | value | value (string) |
149
+ | user_id | reference to the user |
150
+ | event_id | reference to the event (may be NULL) |
151
+
152
+ `events.n` and `properties.key` reference the `id` column of the `keys` table;
153
+ this is done for performance reasons (event and property names are only stored
154
+ once):
155
+
156
+ | **keys** |
157
+ |----------|
158
+ | id |
159
+ | string |
160
+
161
+ KMDB also keeps the original user identities around in `users`, although you'll
162
+ probably never need them:
163
+
164
+ | **users** |
165
+ |-----------|
166
+ | id |
167
+ | name | the identity given by KissMetrics |
168
+
169
+ as well as all aliasing events:
170
+
171
+ | **aliases** |
172
+ |-------------|
173
+ | id |
174
+ | name1 |
175
+ | name2 |
176
+
177
+
178
+ ### Using ActiveRecord
179
+
180
+ The `KMDB` module exposes four `ActiveRecord` classes:
181
+ `Event`, `Property`, `User` are the main domain objects.
182
+
183
+ `Key` is used to intern strings (event and property names) for performance.
184
+
185
+ Please consult the source of these models for details.
186
+
data/Rakefile CHANGED
@@ -3,3 +3,4 @@ require 'rspec/core/rake_task'
3
3
  Bundler::GemHelper.install_tasks
4
4
 
5
5
  RSpec::Core::RakeTask.new(:spec)
6
+
data/bin/kmdb-flush ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/redis'
4
+
5
+ KMDB.connect.migrate
6
+
7
+ KMDB.transaction do |c|
8
+ %w(aliases dumpfiles events properties users).each do |table|
9
+ c.execute "TRUNCATE TABLE #{table}"
10
+ end
11
+ end
12
+
13
+ KMDB::Redis.connection.flushdb
data/bin/kmdb-import ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+
4
+ Import KM events from the raw dumps.
5
+
6
+ =end
7
+ require 'kmdb'
8
+ require 'kmdb/resque'
9
+ require 'kmdb/jobs/find_files'
10
+ require 'kmdb/jobs/list_files'
11
+
12
+ KMDB::Resque.enqueue(KMDB::Jobs::FindFiles)
13
+
data/bin/kmdb-partition ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/models/event'
4
+ require 'kmdb/models/alias'
5
+ require 'kmdb/models/property'
6
+ require 'kmdb/services/partitioner'
7
+
8
+ KMDB.connect.migrate
9
+
10
+ module KMDB
11
+ [Event, Property, Alias].each do |model|
12
+ Services::Partitioner.new(model: model).run
13
+ end
14
+ end
15
+
data/bin/kmdb-pool ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ exec(
4
+ 'foreman', 'start',
5
+ '--procfile=%s' % File.expand_path('../../Procfile.work', __FILE__),
6
+ '--formation=resque=%s' % ENV.fetch('RESQUE_WORKERS', 1)
7
+ )
8
+
data/bin/kmdb-realias ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+ =begin
3
+
4
+ Import KM events from the raw dumps.
5
+
6
+ =end
7
+ require 'kmdb'
8
+ require 'kmdb/resque'
9
+ require 'kmdb/jobs/redo_unaliasing'
10
+
11
+ KMDB::Resque.enqueue(KMDB::Jobs::RedoUnaliasing, Date.today.to_s)
12
+
data/bin/kmdb-ui ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ exec('rackup',
4
+ '--port', ENV.fetch('PORT'),
5
+ File.expand_path('../../config.ru', __FILE__)
6
+ )
data/bin/kmdb-work ADDED
@@ -0,0 +1,17 @@
1
+ #!/usr/bin/env ruby
2
+ require 'kmdb'
3
+ require 'kmdb/resque'
4
+
5
+ # suppress a silly warning
6
+ require 'i18n'
7
+ I18n.enforce_available_locales = false
8
+
9
+ # load all jobs
10
+ Dir[File.expand_path('../../lib/kmdb/jobs/*.rb', __FILE__)].each do |job|
11
+ require job
12
+ end
13
+
14
+
15
+ KMDB.connect
16
+ KMDB::Resque.work
17
+