km-db 0.2.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. checksums.yaml +7 -0
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +2 -4
  5. data/Gemfile.lock +179 -20
  6. data/Procfile +2 -0
  7. data/Procfile.work +1 -0
  8. data/README.md +186 -0
  9. data/Rakefile +1 -0
  10. data/bin/kmdb-flush +13 -0
  11. data/bin/kmdb-import +13 -0
  12. data/bin/kmdb-partition +15 -0
  13. data/bin/kmdb-pool +8 -0
  14. data/bin/kmdb-realias +12 -0
  15. data/bin/kmdb-ui +6 -0
  16. data/bin/kmdb-work +17 -0
  17. data/config/amazon-rds-ca-cert.pem +260 -0
  18. data/config.ru +8 -0
  19. data/km-db.gemspec +17 -17
  20. data/lib/kmdb/{belongs_to_user.rb → concerns/belongs_to_user.rb} +3 -3
  21. data/lib/kmdb/concerns/has_properties.rb +35 -0
  22. data/lib/kmdb/jobs/find_files.rb +32 -0
  23. data/lib/kmdb/jobs/list_files.rb +37 -0
  24. data/lib/kmdb/jobs/locked.rb +10 -0
  25. data/lib/kmdb/jobs/parse_file.rb +109 -0
  26. data/lib/kmdb/jobs/record_batch.rb +65 -0
  27. data/lib/kmdb/jobs/redo_unaliasing.rb +31 -0
  28. data/lib/kmdb/jobs/unalias_user.rb +32 -0
  29. data/lib/kmdb/migrations/01_kmdb_initial.rb +78 -0
  30. data/lib/kmdb/migrations/02_kmdb_partitions.rb +28 -0
  31. data/lib/kmdb/migrations/03_kmdb_blacklist.rb +20 -0
  32. data/lib/kmdb/models/alias.rb +36 -0
  33. data/lib/kmdb/models/blacklisted_property.rb +20 -0
  34. data/lib/kmdb/models/custom_record.rb +53 -0
  35. data/lib/kmdb/models/dumpfile.rb +33 -0
  36. data/lib/kmdb/models/event.rb +56 -0
  37. data/lib/kmdb/models/event_batch.rb +72 -0
  38. data/lib/kmdb/models/global_uid.rb +42 -0
  39. data/lib/kmdb/models/ignored_user.rb +20 -0
  40. data/lib/kmdb/models/json_file.rb +56 -0
  41. data/lib/kmdb/models/key.rb +28 -0
  42. data/lib/kmdb/models/property.rb +44 -0
  43. data/lib/kmdb/models/s3_object.rb +54 -0
  44. data/lib/kmdb/models/user.rb +53 -0
  45. data/lib/kmdb/models/whitelisted_event.rb +20 -0
  46. data/lib/kmdb/parser.rb +4 -4
  47. data/lib/kmdb/redis.rb +17 -0
  48. data/lib/kmdb/resque.rb +38 -0
  49. data/lib/kmdb/s3_bucket.rb +33 -0
  50. data/lib/kmdb/services/partitioner.rb +65 -0
  51. data/lib/kmdb/version.rb +1 -1
  52. data/lib/kmdb.rb +31 -6
  53. metadata +236 -186
  54. data/README.markdown +0 -91
  55. data/bin/km_db_import +0 -36
  56. data/lib/kmdb/custom_record.rb +0 -54
  57. data/lib/kmdb/dumpfile.rb +0 -23
  58. data/lib/kmdb/event.rb +0 -39
  59. data/lib/kmdb/has_properties.rb +0 -33
  60. data/lib/kmdb/key.rb +0 -56
  61. data/lib/kmdb/migration.rb +0 -63
  62. data/lib/kmdb/parallel_parser.rb +0 -85
  63. data/lib/kmdb/property.rb +0 -33
  64. data/lib/kmdb/user.rb +0 -83
data/lib/kmdb/event.rb DELETED
@@ -1,39 +0,0 @@
1
- require 'kmdb/custom_record'
2
- require 'kmdb/belongs_to_user'
3
- require 'kmdb/has_properties'
4
-
5
- module KMDB
6
- class Event < CustomRecord
7
- include BelongsToUser
8
- include HasProperties
9
-
10
- set_table_name "events"
11
-
12
- named_scope :before, lambda { |date| { :conditions => ["`#{table_name}`.`t` < ?", date] } }
13
- named_scope :after, lambda { |date| { :conditions => ["`#{table_name}`.`t` > ?", date] } }
14
-
15
- named_scope :named, lambda { |name| { :conditions => { :n => KMDB::Key.get(name) } } }
16
-
17
- named_scope :by_date, lambda { { :order => "`#{table_name}`.`t` ASC" } }
18
-
19
- # return value of property
20
- def prop(name)
21
- properties.named(name).first.andand.value
22
- end
23
-
24
- def name
25
- KMDB::Key.find(n).value
26
- end
27
-
28
- def self.record(hash)
29
- user_name = hash.delete('_p')
30
- user ||= User.get(user_name)
31
- raise UserError.new "User missing for '#{user_name}'" unless user.present?
32
-
33
- stamp = Time.at hash.delete('_t')
34
- key = Key.get hash.delete('_n')
35
- event = create(:t => stamp, :n => key, :user => user)
36
- Property.set(hash, stamp, user, event)
37
- end
38
- end
39
- end
data/lib/kmdb/has_properties.rb DELETED
@@ -1,33 +0,0 @@
1
- =begin
2
-
3
- KMDB::HasProperties --
4
-
5
- Trait shared by Event and User.
6
-
7
- =end
8
-
9
- module KMDB
10
- module HasProperties
11
- def self.included(mod)
12
- mod.class_eval do
13
- has_many :properties, :class_name => 'KMDB::Property'
14
-
15
- named_scope :with_properties, lambda { |*props|
16
- direction = props.delete(:exclude_missing) ? 'INNER' : 'LEFT'
17
- prop_table = Property.table_name
18
- selects = ["`#{table_name}`.*"]
19
- joins = []
20
- props.each_with_index { |prop,k|
21
- temp_name = "#{prop_table}_#{k}"
22
- selects << "`#{temp_name}`.`value` AS `#{prop.split.join('_')}`"
23
- joins << sanitize_sql_array([%Q{
24
- #{direction} JOIN `properties` AS `#{temp_name}`
25
- ON `#{table_name}`.id = `#{temp_name}`.event_id
26
- AND `#{temp_name}`.`key` = ?}, KMDB::Key.get(prop)])
27
- }
28
- { :select => selects.join(', '), :joins => joins.join("\n") }
29
- }
30
- end
31
- end
32
- end
33
- end
data/lib/kmdb/key.rb DELETED
@@ -1,56 +0,0 @@
1
- =begin
2
-
3
- Map strings (event and property names) to unique integers (Key#id) for performance
4
-
5
- =end
6
-
7
- require 'kmdb/custom_record'
8
-
9
- module KMDB
10
- class Key < CustomRecord
11
- set_table_name "keys"
12
-
13
- has_many :events, :foreign_key => :n, :class_name => 'KMDB::Event', :dependent => :delete_all
14
- has_many :properties, :foreign_key => :key, :class_name => 'KMDB::Property', :dependent => :delete_all
15
-
16
- named_scope :has_duplicate, lambda {
17
- {
18
- :select => "id, string, COUNT(id) AS quantity",
19
- :group => :string, :having => "quantity > 1"
20
- }
21
- }
22
-
23
- def self.get(string)
24
- @cache ||= {}
25
- @cache[string] ||= get_uncached(string)
26
- end
27
-
28
- # Replace each duplicate key ID with its most-used variant
29
- def self.fix_duplicates!
30
- has_duplicate.map(&:string).each do |string|
31
- all_keys = find(:all, :conditions => { :string => string })
32
-
33
- # sort keys by usage
34
- all_ids = all_keys.map { |key|
35
- [key.id, Event.named(key.id).count + Property.named(key.id).count]
36
- }.sort { |k1,k2|
37
- k1.second <=> k2.second
38
- }.map { |k|
39
- k.first
40
- }
41
- id_to_keep = all_ids.pop
42
- $stderr.write "Fixing key '#{string}' #{all_ids.inspect} -> #{id_to_keep.inspect}\n"
43
- Event.update_all({ :n => id_to_keep }, ["`events`.`n` IN (?)", all_ids])
44
- Property.update_all({ :key => id_to_keep }, ["`properties`.`key` IN (?)", all_ids])
45
- Key.delete_all(["id IN (?)", all_ids])
46
- end
47
- end
48
-
49
- private
50
-
51
- def self.get_uncached(string)
52
- string.size <= MaxStringSize or raise "String is too long"
53
- find_or_create(:string => string).id
54
- end
55
- end
56
- end
data/lib/kmdb/migration.rb DELETED
@@ -1,63 +0,0 @@
1
- =begin
2
-
3
- Setup a custom database for KissMetrics tracking events.
4
-
5
- =end
6
-
7
- require 'active_record'
8
-
9
- module KMDB
10
- class SetupEventsDatabase < ActiveRecord::Migration
11
- def self.connection
12
- CustomRecord.connection
13
- end
14
-
15
- def self.up
16
- create_table :events do |t|
17
- t.integer :user_id
18
- t.integer :n
19
- t.datetime :t
20
- end
21
- add_index :events, [:n]
22
- add_index :events, [:user_id]
23
-
24
-
25
- create_table :keys do |t|
26
- t.string :string, :limit => MaxStringSize
27
- end
28
- add_index :keys, [:string]
29
-
30
- create_table :properties do |t|
31
- t.integer :user_id
32
- t.integer :event_id
33
- t.integer :key
34
- t.string :value, :limit => 64
35
- t.datetime :t
36
- end
37
- add_index :properties, [:key]
38
- add_index :properties, [:user_id]
39
- add_index :properties, [:event_id]
40
-
41
- create_table :users do |t|
42
- t.string :name, :limit => 48
43
- t.integer :alias_id
44
- end
45
- add_index :users, [:name]
46
-
47
- create_table :dumpfiles do |t|
48
- t.string :path
49
- t.string :job
50
- t.integer :offset
51
- end
52
- add_index :dumpfiles, [:path]
53
-
54
- end
55
-
56
- def self.down
57
- drop_table :events
58
- drop_table :properties
59
- drop_table :users
60
- drop_table :aliases
61
- end
62
- end
63
- end
data/lib/kmdb/parallel_parser.rb DELETED
@@ -1,85 +0,0 @@
1
- require 'kmdb/parser'
2
- require 'parallel'
3
-
4
- module KMDB
5
- class ParallelParser < Parser
6
-
7
- def initialize(options = {})
8
- super(options)
9
- @worker_count = options.delete(:workers) || Parallel.processor_count
10
- end
11
-
12
- def run(argv)
13
- @pipe_rd, @pipe_wr = IO.pipe
14
-
15
- inputs = list_files_in(argv)
16
- total_bytes = total_size_of_files(inputs)
17
- log "total bytes : #{total_bytes}"
18
- total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
19
- log "left to process : #{total_bytes}"
20
-
21
- # Start workers
22
- log "Using #{@worker_count} workers."
23
- Process.fork do
24
- @pipe_rd.close
25
- Parallel.each(inputs, :in_processes => @worker_count) do |input|
26
- KMDB::Event.connection.reconnect!
27
- log "Worker #{Process.pid} starting #{input}"
28
- $0 = "worker: #{input}"
29
- process_events_in_file(input)
30
- log "Worker #{Process.pid} done"
31
- true
32
- end
33
- end
34
-
35
- # Start gatherer
36
- $0 = "gatherer: #{$0}"
37
- @pipe_wr.close
38
- byte_counter = 0
39
- log "Starting gatherer, total bytes: #{total_bytes}"
40
- progress = ProgressBar.new("-" * 20, total_bytes)
41
- while line = @pipe_rd.gets
42
- if line =~ /^OK (\d+)$/
43
- byte_counter += $1.to_i
44
- progress.set byte_counter
45
- elsif line =~ /^FILE (.*)$/
46
- progress.title = $1
47
- else
48
- log "Unparsed line: '#{line}'"
49
- end
50
- end
51
- progress.finish
52
- log "Total bytes processed: #{byte_counter}"
53
- Process.waitall
54
- end
55
-
56
- private
57
-
58
- def process_events_in_file(pathname)
59
- pathname.open do |input|
60
- processed_bytes = 0
61
- if @resume_job
62
- dumpfile = Dumpfile.get(pathname, @resume_job)
63
- log "Starting file #{pathname} from offset #{dumpfile.offset}"
64
- input.seek(dumpfile.offset)
65
- end
66
- line_number = 0
67
- @pipe_wr.write "FILE #{pathname.basename}\n"
68
- while line = input.gets
69
- line_number += 1
70
- processed_bytes += line.size
71
-
72
- process_event(line)
73
- dumpfile.set(input.tell)
74
-
75
- if processed_bytes > 100_000
76
- @pipe_wr.write "OK #{processed_bytes}\n"
77
- processed_bytes = 0
78
- end
79
- end
80
- @pipe_wr.write "OK #{processed_bytes}\n"
81
- end
82
- end
83
-
84
- end
85
- end
data/lib/kmdb/property.rb DELETED
@@ -1,33 +0,0 @@
1
- require 'kmdb/belongs_to_user'
2
-
3
- module KMDB
4
- class Property < CustomRecord
5
- include BelongsToUser
6
-
7
- set_table_name "properties"
8
- belongs_to :event, :class_name => 'KMDB::Event'
9
-
10
- default_scope :order => 't DESC'
11
- named_scope :named, lambda { |name| { :conditions => { :key => KMDB::Key.get(name) } } }
12
-
13
- def self.set(hash, stamp=nil, user=nil, event=nil)
14
- user_name = hash.delete('_p')
15
- user ||= User.get(user_name)
16
- raise UserError.new "User missing for '#{user_name}'" unless user.present?
17
-
18
- event_id = event ? event.id : nil
19
- stamp = Time.at hash.delete('_t') || stamp
20
-
21
- return if hash.empty?
22
- sql_insert = "INSERT INTO `#{table_name}` (`t`,`user_id`,`event_id`,`key`,`value`) VALUES "
23
- sql_values = []
24
-
25
- hash.each_pair do |prop_name,value|
26
- key = Key.get(prop_name)
27
- sql_values << sanitize_sql_array(["(?,?,?,?,?)", stamp,user.id,event_id,key,value])
28
- end
29
-
30
- connection.execute(sql_insert + sql_values.join(","))
31
- end
32
- end
33
- end
data/lib/kmdb/user.rb DELETED
@@ -1,83 +0,0 @@
1
- require 'kmdb/has_properties'
2
-
3
- module KMDB
4
- class User < CustomRecord
5
- include HasProperties
6
-
7
- set_table_name "users"
8
-
9
- has_many :events, :class_name => 'KMDB::Event'
10
- belongs_to :alias, :class_name => 'KMDB::User'
11
- # points to the aliased user. if set, no properties/events should belong to this user
12
-
13
- validates_presence_of :name
14
- validates_uniqueness_of :name
15
-
16
- named_scope :named, lambda { |name| { :conditions => { :name => name } } }
17
-
18
- named_scope :duplicates, lambda {{
19
- :select => "id, COUNT(id) AS quantity", :group => :name, :having => "quantity > 1"
20
- }}
21
-
22
- # return (latest) value of property
23
- def prop(name)
24
- properties.named(name).first.andand.value
25
- end
26
-
27
- # mark this user as aliasing another
28
- def aliases!(other)
29
- [Property,Event].each do |model|
30
- model.user_is(self).update_all({:user_id => other.id})
31
- end
32
- self.update_attributes!(:alias => other)
33
- end
34
-
35
- # return the user named `name` (creating it if necessary)
36
- # if `name` is an alias, return the original user
37
- def self.get(name)
38
- user = named(name).first || create(:name => name)
39
- user = user.alias while user.alias
40
- return user
41
- end
42
-
43
-
44
- # mark the two names as pointing to the same user
45
- def self.alias!(name1, name2)
46
- u1 = get(name1)
47
- u2 = get(name2)
48
- $stderr.write "Warning: user '#{user.name}' has an alias\n" if u1.alias
49
- $stderr.write "Warning: user '#{user.name}' has an alias\n" if u2.alias
50
-
51
- # nothing to do if both names already point to the same user
52
- return if u1 == u2
53
-
54
- u2.aliases! u1
55
- end
56
-
57
-
58
- # duplication can occur during parallel imports because we're not running transactionally.
59
- def self.fix_duplicates!
60
- duplicates.map(&:name).each do |name|
61
- named(name).all.tap do |all_users|
62
- kept_user = all_users.pop
63
- all_users.each do |user|
64
- user.aliases! kept_user
65
- user.destroy
66
- end
67
- end
68
- end
69
- end
70
-
71
-
72
- # detect alias chains
73
- def self.resolve_alias_chains!
74
- find(:all, :joins => :alias, :conditions => 'aliases_users.alias_id IS NOT NULL').each do |user|
75
- user = find(user.id)
76
- origin = find(user.alias_id)
77
- origin = origin.alias while origin.alias # go up the chain
78
- $stderr.write "Aliasing #{user.name} -> #{origin.name}\n"
79
- user.aliases!(origin)
80
- end
81
- end
82
- end
83
- end