km-db 0.2.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +7 -0
  3. data/.ruby-version +1 -0
  4. data/Gemfile +2 -4
  5. data/Gemfile.lock +179 -20
  6. data/Procfile +2 -0
  7. data/Procfile.work +1 -0
  8. data/README.md +186 -0
  9. data/Rakefile +1 -0
  10. data/bin/kmdb-flush +13 -0
  11. data/bin/kmdb-import +13 -0
  12. data/bin/kmdb-partition +15 -0
  13. data/bin/kmdb-pool +8 -0
  14. data/bin/kmdb-realias +12 -0
  15. data/bin/kmdb-ui +6 -0
  16. data/bin/kmdb-work +17 -0
  17. data/config/amazon-rds-ca-cert.pem +260 -0
  18. data/config.ru +8 -0
  19. data/km-db.gemspec +17 -17
  20. data/lib/kmdb/{belongs_to_user.rb → concerns/belongs_to_user.rb} +3 -3
  21. data/lib/kmdb/concerns/has_properties.rb +35 -0
  22. data/lib/kmdb/jobs/find_files.rb +32 -0
  23. data/lib/kmdb/jobs/list_files.rb +37 -0
  24. data/lib/kmdb/jobs/locked.rb +10 -0
  25. data/lib/kmdb/jobs/parse_file.rb +109 -0
  26. data/lib/kmdb/jobs/record_batch.rb +65 -0
  27. data/lib/kmdb/jobs/redo_unaliasing.rb +31 -0
  28. data/lib/kmdb/jobs/unalias_user.rb +32 -0
  29. data/lib/kmdb/migrations/01_kmdb_initial.rb +78 -0
  30. data/lib/kmdb/migrations/02_kmdb_partitions.rb +28 -0
  31. data/lib/kmdb/migrations/03_kmdb_blacklist.rb +20 -0
  32. data/lib/kmdb/models/alias.rb +36 -0
  33. data/lib/kmdb/models/blacklisted_property.rb +20 -0
  34. data/lib/kmdb/models/custom_record.rb +53 -0
  35. data/lib/kmdb/models/dumpfile.rb +33 -0
  36. data/lib/kmdb/models/event.rb +56 -0
  37. data/lib/kmdb/models/event_batch.rb +72 -0
  38. data/lib/kmdb/models/global_uid.rb +42 -0
  39. data/lib/kmdb/models/ignored_user.rb +20 -0
  40. data/lib/kmdb/models/json_file.rb +56 -0
  41. data/lib/kmdb/models/key.rb +28 -0
  42. data/lib/kmdb/models/property.rb +44 -0
  43. data/lib/kmdb/models/s3_object.rb +54 -0
  44. data/lib/kmdb/models/user.rb +53 -0
  45. data/lib/kmdb/models/whitelisted_event.rb +20 -0
  46. data/lib/kmdb/parser.rb +4 -4
  47. data/lib/kmdb/redis.rb +17 -0
  48. data/lib/kmdb/resque.rb +38 -0
  49. data/lib/kmdb/s3_bucket.rb +33 -0
  50. data/lib/kmdb/services/partitioner.rb +65 -0
  51. data/lib/kmdb/version.rb +1 -1
  52. data/lib/kmdb.rb +31 -6
  53. metadata +236 -186
  54. data/README.markdown +0 -91
  55. data/bin/km_db_import +0 -36
  56. data/lib/kmdb/custom_record.rb +0 -54
  57. data/lib/kmdb/dumpfile.rb +0 -23
  58. data/lib/kmdb/event.rb +0 -39
  59. data/lib/kmdb/has_properties.rb +0 -33
  60. data/lib/kmdb/key.rb +0 -56
  61. data/lib/kmdb/migration.rb +0 -63
  62. data/lib/kmdb/parallel_parser.rb +0 -85
  63. data/lib/kmdb/property.rb +0 -33
  64. data/lib/kmdb/user.rb +0 -83
data/lib/kmdb/event.rb DELETED
@@ -1,39 +0,0 @@
1
- require 'kmdb/custom_record'
2
- require 'kmdb/belongs_to_user'
3
- require 'kmdb/has_properties'
4
-
5
- module KMDB
6
- class Event < CustomRecord
7
- include BelongsToUser
8
- include HasProperties
9
-
10
- set_table_name "events"
11
-
12
- named_scope :before, lambda { |date| { :conditions => ["`#{table_name}`.`t` < ?", date] } }
13
- named_scope :after, lambda { |date| { :conditions => ["`#{table_name}`.`t` > ?", date] } }
14
-
15
- named_scope :named, lambda { |name| { :conditions => { :n => KMDB::Key.get(name) } } }
16
-
17
- named_scope :by_date, lambda { { :order => "`#{table_name}`.`t` ASC" } }
18
-
19
- # return value of property
20
- def prop(name)
21
- properties.named(name).first.andand.value
22
- end
23
-
24
- def name
25
- KMDB::Key.find(n).value
26
- end
27
-
28
- def self.record(hash)
29
- user_name = hash.delete('_p')
30
- user ||= User.get(user_name)
31
- raise UserError.new "User missing for '#{user_name}'" unless user.present?
32
-
33
- stamp = Time.at hash.delete('_t')
34
- key = Key.get hash.delete('_n')
35
- event = create(:t => stamp, :n => key, :user => user)
36
- Property.set(hash, stamp, user, event)
37
- end
38
- end
39
- end
data/lib/kmdb/has_properties.rb DELETED
@@ -1,33 +0,0 @@
1
- =begin
2
-
3
- KMDB::HasProperties --
4
-
5
- Trait shared by Event and User.
6
-
7
- =end
8
-
9
- module KMDB
10
- module HasProperties
11
- def self.included(mod)
12
- mod.class_eval do
13
- has_many :properties, :class_name => 'KMDB::Property'
14
-
15
- named_scope :with_properties, lambda { |*props|
16
- direction = props.delete(:exclude_missing) ? 'INNER' : 'LEFT'
17
- prop_table = Property.table_name
18
- selects = ["`#{table_name}`.*"]
19
- joins = []
20
- props.each_with_index { |prop,k|
21
- temp_name = "#{prop_table}_#{k}"
22
- selects << "`#{temp_name}`.`value` AS `#{prop.split.join('_')}`"
23
- joins << sanitize_sql_array([%Q{
24
- #{direction} JOIN `properties` AS `#{temp_name}`
25
- ON `#{table_name}`.id = `#{temp_name}`.event_id
26
- AND `#{temp_name}`.`key` = ?}, KMDB::Key.get(prop)])
27
- }
28
- { :select => selects.join(', '), :joins => joins.join("\n") }
29
- }
30
- end
31
- end
32
- end
33
- end
data/lib/kmdb/key.rb DELETED
@@ -1,56 +0,0 @@
1
- =begin
2
-
3
- Map strings (event and property names) to unique integers (Key#id) for performance
4
-
5
- =end
6
-
7
- require 'kmdb/custom_record'
8
-
9
- module KMDB
10
- class Key < CustomRecord
11
- set_table_name "keys"
12
-
13
- has_many :events, :foreign_key => :n, :class_name => 'KMDB::Event', :dependent => :delete_all
14
- has_many :properties, :foreign_key => :key, :class_name => 'KMDB::Property', :dependent => :delete_all
15
-
16
- named_scope :has_duplicate, lambda {
17
- {
18
- :select => "id, string, COUNT(id) AS quantity",
19
- :group => :string, :having => "quantity > 1"
20
- }
21
- }
22
-
23
- def self.get(string)
24
- @cache ||= {}
25
- @cache[string] ||= get_uncached(string)
26
- end
27
-
28
- # Replace each duplicate key ID with its most-used variant
29
- def self.fix_duplicates!
30
- has_duplicate.map(&:string).each do |string|
31
- all_keys = find(:all, :conditions => { :string => string })
32
-
33
- # sort keys by usage
34
- all_ids = all_keys.map { |key|
35
- [key.id, Event.named(key.id).count + Property.named(key.id).count]
36
- }.sort { |k1,k2|
37
- k1.second <=> k2.second
38
- }.map { |k|
39
- k.first
40
- }
41
- id_to_keep = all_ids.pop
42
- $stderr.write "Fixing key '#{string}' #{all_ids.inspect} -> #{id_to_keep.inspect}\n"
43
- Event.update_all({ :n => id_to_keep }, ["`events`.`n` IN (?)", all_ids])
44
- Property.update_all({ :key => id_to_keep }, ["`properties`.`key` IN (?)", all_ids])
45
- Key.delete_all(["id IN (?)", all_ids])
46
- end
47
- end
48
-
49
- private
50
-
51
- def self.get_uncached(string)
52
- string.size <= MaxStringSize or raise "String is too long"
53
- find_or_create(:string => string).id
54
- end
55
- end
56
- end
data/lib/kmdb/migration.rb DELETED
@@ -1,63 +0,0 @@
1
- =begin
2
-
3
- Setup a custom database for KissMetrics tracking events.
4
-
5
- =end
6
-
7
- require 'active_record'
8
-
9
- module KMDB
10
- class SetupEventsDatabase < ActiveRecord::Migration
11
- def self.connection
12
- CustomRecord.connection
13
- end
14
-
15
- def self.up
16
- create_table :events do |t|
17
- t.integer :user_id
18
- t.integer :n
19
- t.datetime :t
20
- end
21
- add_index :events, [:n]
22
- add_index :events, [:user_id]
23
-
24
-
25
- create_table :keys do |t|
26
- t.string :string, :limit => MaxStringSize
27
- end
28
- add_index :keys, [:string]
29
-
30
- create_table :properties do |t|
31
- t.integer :user_id
32
- t.integer :event_id
33
- t.integer :key
34
- t.string :value, :limit => 64
35
- t.datetime :t
36
- end
37
- add_index :properties, [:key]
38
- add_index :properties, [:user_id]
39
- add_index :properties, [:event_id]
40
-
41
- create_table :users do |t|
42
- t.string :name, :limit => 48
43
- t.integer :alias_id
44
- end
45
- add_index :users, [:name]
46
-
47
- create_table :dumpfiles do |t|
48
- t.string :path
49
- t.string :job
50
- t.integer :offset
51
- end
52
- add_index :dumpfiles, [:path]
53
-
54
- end
55
-
56
- def self.down
57
- drop_table :events
58
- drop_table :properties
59
- drop_table :users
60
- drop_table :aliases
61
- end
62
- end
63
- end
data/lib/kmdb/parallel_parser.rb DELETED
@@ -1,85 +0,0 @@
1
- require 'kmdb/parser'
2
- require 'parallel'
3
-
4
- module KMDB
5
- class ParallelParser < Parser
6
-
7
- def initialize(options = {})
8
- super(options)
9
- @worker_count = options.delete(:workers) || Parallel.processor_count
10
- end
11
-
12
- def run(argv)
13
- @pipe_rd, @pipe_wr = IO.pipe
14
-
15
- inputs = list_files_in(argv)
16
- total_bytes = total_size_of_files(inputs)
17
- log "total bytes : #{total_bytes}"
18
- total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
19
- log "left to process : #{total_bytes}"
20
-
21
- # Start workers
22
- log "Using #{@worker_count} workers."
23
- Process.fork do
24
- @pipe_rd.close
25
- Parallel.each(inputs, :in_processes => @worker_count) do |input|
26
- KMDB::Event.connection.reconnect!
27
- log "Worker #{Process.pid} starting #{input}"
28
- $0 = "worker: #{input}"
29
- process_events_in_file(input)
30
- log "Worker #{Process.pid} done"
31
- true
32
- end
33
- end
34
-
35
- # Start gatherer
36
- $0 = "gatherer: #{$0}"
37
- @pipe_wr.close
38
- byte_counter = 0
39
- log "Starting gatherer, total bytes: #{total_bytes}"
40
- progress = ProgressBar.new("-" * 20, total_bytes)
41
- while line = @pipe_rd.gets
42
- if line =~ /^OK (\d+)$/
43
- byte_counter += $1.to_i
44
- progress.set byte_counter
45
- elsif line =~ /^FILE (.*)$/
46
- progress.title = $1
47
- else
48
- log "Unparsed line: '#{line}'"
49
- end
50
- end
51
- progress.finish
52
- log "Total bytes processed: #{byte_counter}"
53
- Process.waitall
54
- end
55
-
56
- private
57
-
58
- def process_events_in_file(pathname)
59
- pathname.open do |input|
60
- processed_bytes = 0
61
- if @resume_job
62
- dumpfile = Dumpfile.get(pathname, @resume_job)
63
- log "Starting file #{pathname} from offset #{dumpfile.offset}"
64
- input.seek(dumpfile.offset)
65
- end
66
- line_number = 0
67
- @pipe_wr.write "FILE #{pathname.basename}\n"
68
- while line = input.gets
69
- line_number += 1
70
- processed_bytes += line.size
71
-
72
- process_event(line)
73
- dumpfile.set(input.tell)
74
-
75
- if processed_bytes > 100_000
76
- @pipe_wr.write "OK #{processed_bytes}\n"
77
- processed_bytes = 0
78
- end
79
- end
80
- @pipe_wr.write "OK #{processed_bytes}\n"
81
- end
82
- end
83
-
84
- end
85
- end
data/lib/kmdb/property.rb DELETED
@@ -1,33 +0,0 @@
1
- require 'kmdb/belongs_to_user'
2
-
3
- module KMDB
4
- class Property < CustomRecord
5
- include BelongsToUser
6
-
7
- set_table_name "properties"
8
- belongs_to :event, :class_name => 'KMDB::Event'
9
-
10
- default_scope :order => 't DESC'
11
- named_scope :named, lambda { |name| { :conditions => { :key => KMDB::Key.get(name) } } }
12
-
13
- def self.set(hash, stamp=nil, user=nil, event=nil)
14
- user_name = hash.delete('_p')
15
- user ||= User.get(user_name)
16
- raise UserError.new "User missing for '#{user_name}'" unless user.present?
17
-
18
- event_id = event ? event.id : nil
19
- stamp = Time.at hash.delete('_t') || stamp
20
-
21
- return if hash.empty?
22
- sql_insert = "INSERT INTO `#{table_name}` (`t`,`user_id`,`event_id`,`key`,`value`) VALUES "
23
- sql_values = []
24
-
25
- hash.each_pair do |prop_name,value|
26
- key = Key.get(prop_name)
27
- sql_values << sanitize_sql_array(["(?,?,?,?,?)", stamp,user.id,event_id,key,value])
28
- end
29
-
30
- connection.execute(sql_insert + sql_values.join(","))
31
- end
32
- end
33
- end
data/lib/kmdb/user.rb DELETED
@@ -1,83 +0,0 @@
1
- require 'kmdb/has_properties'
2
-
3
- module KMDB
4
- class User < CustomRecord
5
- include HasProperties
6
-
7
- set_table_name "users"
8
-
9
- has_many :events, :class_name => 'KMDB::Event'
10
- belongs_to :alias, :class_name => 'KMDB::User'
11
- # points to the aliased user. if set, no properties/events should belong to this user
12
-
13
- validates_presence_of :name
14
- validates_uniqueness_of :name
15
-
16
- named_scope :named, lambda { |name| { :conditions => { :name => name } } }
17
-
18
- named_scope :duplicates, lambda {{
19
- :select => "id, COUNT(id) AS quantity", :group => :name, :having => "quantity > 1"
20
- }}
21
-
22
- # return (latest) value of property
23
- def prop(name)
24
- properties.named(name).first.andand.value
25
- end
26
-
27
- # mark this user as aliasing another
28
- def aliases!(other)
29
- [Property,Event].each do |model|
30
- model.user_is(self).update_all({:user_id => other.id})
31
- end
32
- self.update_attributes!(:alias => other)
33
- end
34
-
35
- # return the user named `name` (creating it if necessary)
36
- # if `name` is an alias, return the original user
37
- def self.get(name)
38
- user = named(name).first || create(:name => name)
39
- user = user.alias while user.alias
40
- return user
41
- end
42
-
43
-
44
- # mark the two names as pointing to the same user
45
- def self.alias!(name1, name2)
46
- u1 = get(name1)
47
- u2 = get(name2)
48
- $stderr.write "Warning: user '#{user.name}' has an alias\n" if u1.alias
49
- $stderr.write "Warning: user '#{user.name}' has an alias\n" if u2.alias
50
-
51
- # nothing to do if both names already point to the same user
52
- return if u1 == u2
53
-
54
- u2.aliases! u1
55
- end
56
-
57
-
58
- # duplication can occur during parallel imports because we're not running transactionally.
59
- def self.fix_duplicates!
60
- duplicates.map(&:name).each do |name|
61
- named(name).all.tap do |all_users|
62
- kept_user = all_users.pop
63
- all_users.each do |user|
64
- user.aliases! kept_user
65
- user.destroy
66
- end
67
- end
68
- end
69
- end
70
-
71
-
72
- # detect alias chains
73
- def self.resolve_alias_chains!
74
- find(:all, :joins => :alias, :conditions => 'aliases_users.alias_id IS NOT NULL').each do |user|
75
- user = find(user.id)
76
- origin = find(user.alias_id)
77
- origin = origin.alias while origin.alias # go up the chain
78
- $stderr.write "Aliasing #{user.name} -> #{origin.name}\n"
79
- user.aliases!(origin)
80
- end
81
- end
82
- end
83
- end