km-db 0.2.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +2 -4
- data/Gemfile.lock +179 -20
- data/Procfile +2 -0
- data/Procfile.work +1 -0
- data/README.md +186 -0
- data/Rakefile +1 -0
- data/bin/kmdb-flush +13 -0
- data/bin/kmdb-import +13 -0
- data/bin/kmdb-partition +15 -0
- data/bin/kmdb-pool +8 -0
- data/bin/kmdb-realias +12 -0
- data/bin/kmdb-ui +6 -0
- data/bin/kmdb-work +17 -0
- data/config/amazon-rds-ca-cert.pem +260 -0
- data/config.ru +8 -0
- data/km-db.gemspec +17 -17
- data/lib/kmdb/{belongs_to_user.rb → concerns/belongs_to_user.rb} +3 -3
- data/lib/kmdb/concerns/has_properties.rb +35 -0
- data/lib/kmdb/jobs/find_files.rb +32 -0
- data/lib/kmdb/jobs/list_files.rb +37 -0
- data/lib/kmdb/jobs/locked.rb +10 -0
- data/lib/kmdb/jobs/parse_file.rb +109 -0
- data/lib/kmdb/jobs/record_batch.rb +65 -0
- data/lib/kmdb/jobs/redo_unaliasing.rb +31 -0
- data/lib/kmdb/jobs/unalias_user.rb +32 -0
- data/lib/kmdb/migrations/01_kmdb_initial.rb +78 -0
- data/lib/kmdb/migrations/02_kmdb_partitions.rb +28 -0
- data/lib/kmdb/migrations/03_kmdb_blacklist.rb +20 -0
- data/lib/kmdb/models/alias.rb +36 -0
- data/lib/kmdb/models/blacklisted_property.rb +20 -0
- data/lib/kmdb/models/custom_record.rb +53 -0
- data/lib/kmdb/models/dumpfile.rb +33 -0
- data/lib/kmdb/models/event.rb +56 -0
- data/lib/kmdb/models/event_batch.rb +72 -0
- data/lib/kmdb/models/global_uid.rb +42 -0
- data/lib/kmdb/models/ignored_user.rb +20 -0
- data/lib/kmdb/models/json_file.rb +56 -0
- data/lib/kmdb/models/key.rb +28 -0
- data/lib/kmdb/models/property.rb +44 -0
- data/lib/kmdb/models/s3_object.rb +54 -0
- data/lib/kmdb/models/user.rb +53 -0
- data/lib/kmdb/models/whitelisted_event.rb +20 -0
- data/lib/kmdb/parser.rb +4 -4
- data/lib/kmdb/redis.rb +17 -0
- data/lib/kmdb/resque.rb +38 -0
- data/lib/kmdb/s3_bucket.rb +33 -0
- data/lib/kmdb/services/partitioner.rb +65 -0
- data/lib/kmdb/version.rb +1 -1
- data/lib/kmdb.rb +31 -6
- metadata +236 -186
- data/README.markdown +0 -91
- data/bin/km_db_import +0 -36
- data/lib/kmdb/custom_record.rb +0 -54
- data/lib/kmdb/dumpfile.rb +0 -23
- data/lib/kmdb/event.rb +0 -39
- data/lib/kmdb/has_properties.rb +0 -33
- data/lib/kmdb/key.rb +0 -56
- data/lib/kmdb/migration.rb +0 -63
- data/lib/kmdb/parallel_parser.rb +0 -85
- data/lib/kmdb/property.rb +0 -33
- data/lib/kmdb/user.rb +0 -83
data/lib/kmdb/event.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'kmdb/custom_record'
|
2
|
-
require 'kmdb/belongs_to_user'
|
3
|
-
require 'kmdb/has_properties'
|
4
|
-
|
5
|
-
module KMDB
|
6
|
-
class Event < CustomRecord
|
7
|
-
include BelongsToUser
|
8
|
-
include HasProperties
|
9
|
-
|
10
|
-
set_table_name "events"
|
11
|
-
|
12
|
-
named_scope :before, lambda { |date| { :conditions => ["`#{table_name}`.`t` < ?", date] } }
|
13
|
-
named_scope :after, lambda { |date| { :conditions => ["`#{table_name}`.`t` > ?", date] } }
|
14
|
-
|
15
|
-
named_scope :named, lambda { |name| { :conditions => { :n => KMDB::Key.get(name) } } }
|
16
|
-
|
17
|
-
named_scope :by_date, lambda { { :order => "`#{table_name}`.`t` ASC" } }
|
18
|
-
|
19
|
-
# return value of property
|
20
|
-
def prop(name)
|
21
|
-
properties.named(name).first.andand.value
|
22
|
-
end
|
23
|
-
|
24
|
-
def name
|
25
|
-
KMDB::Key.find(n).value
|
26
|
-
end
|
27
|
-
|
28
|
-
def self.record(hash)
|
29
|
-
user_name = hash.delete('_p')
|
30
|
-
user ||= User.get(user_name)
|
31
|
-
raise UserError.new "User missing for '#{user_name}'" unless user.present?
|
32
|
-
|
33
|
-
stamp = Time.at hash.delete('_t')
|
34
|
-
key = Key.get hash.delete('_n')
|
35
|
-
event = create(:t => stamp, :n => key, :user => user)
|
36
|
-
Property.set(hash, stamp, user, event)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
end
|
data/lib/kmdb/has_properties.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
=begin
|
2
|
-
|
3
|
-
KMDB::HasProperties --
|
4
|
-
|
5
|
-
Trait shared by Event and User.
|
6
|
-
|
7
|
-
=end
|
8
|
-
|
9
|
-
module KMDB
|
10
|
-
module HasProperties
|
11
|
-
def self.included(mod)
|
12
|
-
mod.class_eval do
|
13
|
-
has_many :properties, :class_name => 'KMDB::Property'
|
14
|
-
|
15
|
-
named_scope :with_properties, lambda { |*props|
|
16
|
-
direction = props.delete(:exclude_missing) ? 'INNER' : 'LEFT'
|
17
|
-
prop_table = Property.table_name
|
18
|
-
selects = ["`#{table_name}`.*"]
|
19
|
-
joins = []
|
20
|
-
props.each_with_index { |prop,k|
|
21
|
-
temp_name = "#{prop_table}_#{k}"
|
22
|
-
selects << "`#{temp_name}`.`value` AS `#{prop.split.join('_')}`"
|
23
|
-
joins << sanitize_sql_array([%Q{
|
24
|
-
#{direction} JOIN `properties` AS `#{temp_name}`
|
25
|
-
ON `#{table_name}`.id = `#{temp_name}`.event_id
|
26
|
-
AND `#{temp_name}`.`key` = ?}, KMDB::Key.get(prop)])
|
27
|
-
}
|
28
|
-
{ :select => selects.join(', '), :joins => joins.join("\n") }
|
29
|
-
}
|
30
|
-
end
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
data/lib/kmdb/key.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
=begin
|
2
|
-
|
3
|
-
Map strings (event and property names) to unique integers (Key#id) for performance
|
4
|
-
|
5
|
-
=end
|
6
|
-
|
7
|
-
require 'kmdb/custom_record'
|
8
|
-
|
9
|
-
module KMDB
|
10
|
-
class Key < CustomRecord
|
11
|
-
set_table_name "keys"
|
12
|
-
|
13
|
-
has_many :events, :foreign_key => :n, :class_name => 'KMDB::Event', :dependent => :delete_all
|
14
|
-
has_many :properties, :foreign_key => :key, :class_name => 'KMDB::Property', :dependent => :delete_all
|
15
|
-
|
16
|
-
named_scope :has_duplicate, lambda {
|
17
|
-
{
|
18
|
-
:select => "id, string, COUNT(id) AS quantity",
|
19
|
-
:group => :string, :having => "quantity > 1"
|
20
|
-
}
|
21
|
-
}
|
22
|
-
|
23
|
-
def self.get(string)
|
24
|
-
@cache ||= {}
|
25
|
-
@cache[string] ||= get_uncached(string)
|
26
|
-
end
|
27
|
-
|
28
|
-
# Replace each duplicate key ID with its most-used variant
|
29
|
-
def self.fix_duplicates!
|
30
|
-
has_duplicate.map(&:string).each do |string|
|
31
|
-
all_keys = find(:all, :conditions => { :string => string })
|
32
|
-
|
33
|
-
# sort keys by usage
|
34
|
-
all_ids = all_keys.map { |key|
|
35
|
-
[key.id, Event.named(key.id).count + Property.named(key.id).count]
|
36
|
-
}.sort { |k1,k2|
|
37
|
-
k1.second <=> k2.second
|
38
|
-
}.map { |k|
|
39
|
-
k.first
|
40
|
-
}
|
41
|
-
id_to_keep = all_ids.pop
|
42
|
-
$stderr.write "Fixing key '#{string}' #{all_ids.inspect} -> #{id_to_keep.inspect}\n"
|
43
|
-
Event.update_all({ :n => id_to_keep }, ["`events`.`n` IN (?)", all_ids])
|
44
|
-
Property.update_all({ :key => id_to_keep }, ["`properties`.`key` IN (?)", all_ids])
|
45
|
-
Key.delete_all(["id IN (?)", all_ids])
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
private
|
50
|
-
|
51
|
-
def self.get_uncached(string)
|
52
|
-
string.size <= MaxStringSize or raise "String is too long"
|
53
|
-
find_or_create(:string => string).id
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
data/lib/kmdb/migration.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
=begin
|
2
|
-
|
3
|
-
Setup a custom database for KissMetrics tracking events.
|
4
|
-
|
5
|
-
=end
|
6
|
-
|
7
|
-
require 'active_record'
|
8
|
-
|
9
|
-
module KMDB
|
10
|
-
class SetupEventsDatabase < ActiveRecord::Migration
|
11
|
-
def self.connection
|
12
|
-
CustomRecord.connection
|
13
|
-
end
|
14
|
-
|
15
|
-
def self.up
|
16
|
-
create_table :events do |t|
|
17
|
-
t.integer :user_id
|
18
|
-
t.integer :n
|
19
|
-
t.datetime :t
|
20
|
-
end
|
21
|
-
add_index :events, [:n]
|
22
|
-
add_index :events, [:user_id]
|
23
|
-
|
24
|
-
|
25
|
-
create_table :keys do |t|
|
26
|
-
t.string :string, :limit => MaxStringSize
|
27
|
-
end
|
28
|
-
add_index :keys, [:string]
|
29
|
-
|
30
|
-
create_table :properties do |t|
|
31
|
-
t.integer :user_id
|
32
|
-
t.integer :event_id
|
33
|
-
t.integer :key
|
34
|
-
t.string :value, :limit => 64
|
35
|
-
t.datetime :t
|
36
|
-
end
|
37
|
-
add_index :properties, [:key]
|
38
|
-
add_index :properties, [:user_id]
|
39
|
-
add_index :properties, [:event_id]
|
40
|
-
|
41
|
-
create_table :users do |t|
|
42
|
-
t.string :name, :limit => 48
|
43
|
-
t.integer :alias_id
|
44
|
-
end
|
45
|
-
add_index :users, [:name]
|
46
|
-
|
47
|
-
create_table :dumpfiles do |t|
|
48
|
-
t.string :path
|
49
|
-
t.string :job
|
50
|
-
t.integer :offset
|
51
|
-
end
|
52
|
-
add_index :dumpfiles, [:path]
|
53
|
-
|
54
|
-
end
|
55
|
-
|
56
|
-
def self.down
|
57
|
-
drop_table :events
|
58
|
-
drop_table :properties
|
59
|
-
drop_table :users
|
60
|
-
drop_table :aliases
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
data/lib/kmdb/parallel_parser.rb
DELETED
@@ -1,85 +0,0 @@
|
|
1
|
-
require 'kmdb/parser'
|
2
|
-
require 'parallel'
|
3
|
-
|
4
|
-
module KMDB
|
5
|
-
class ParallelParser < Parser
|
6
|
-
|
7
|
-
def initialize(options = {})
|
8
|
-
super(options)
|
9
|
-
@worker_count = options.delete(:workers) || Parallel.processor_count
|
10
|
-
end
|
11
|
-
|
12
|
-
def run(argv)
|
13
|
-
@pipe_rd, @pipe_wr = IO.pipe
|
14
|
-
|
15
|
-
inputs = list_files_in(argv)
|
16
|
-
total_bytes = total_size_of_files(inputs)
|
17
|
-
log "total bytes : #{total_bytes}"
|
18
|
-
total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
|
19
|
-
log "left to process : #{total_bytes}"
|
20
|
-
|
21
|
-
# Start workers
|
22
|
-
log "Using #{@worker_count} workers."
|
23
|
-
Process.fork do
|
24
|
-
@pipe_rd.close
|
25
|
-
Parallel.each(inputs, :in_processes => @worker_count) do |input|
|
26
|
-
KMDB::Event.connection.reconnect!
|
27
|
-
log "Worker #{Process.pid} starting #{input}"
|
28
|
-
$0 = "worker: #{input}"
|
29
|
-
process_events_in_file(input)
|
30
|
-
log "Worker #{Process.pid} done"
|
31
|
-
true
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
# Start gatherer
|
36
|
-
$0 = "gatherer: #{$0}"
|
37
|
-
@pipe_wr.close
|
38
|
-
byte_counter = 0
|
39
|
-
log "Starting gatherer, total bytes: #{total_bytes}"
|
40
|
-
progress = ProgressBar.new("-" * 20, total_bytes)
|
41
|
-
while line = @pipe_rd.gets
|
42
|
-
if line =~ /^OK (\d+)$/
|
43
|
-
byte_counter += $1.to_i
|
44
|
-
progress.set byte_counter
|
45
|
-
elsif line =~ /^FILE (.*)$/
|
46
|
-
progress.title = $1
|
47
|
-
else
|
48
|
-
log "Unparsed line: '#{line}'"
|
49
|
-
end
|
50
|
-
end
|
51
|
-
progress.finish
|
52
|
-
log "Total bytes processed: #{byte_counter}"
|
53
|
-
Process.waitall
|
54
|
-
end
|
55
|
-
|
56
|
-
private
|
57
|
-
|
58
|
-
def process_events_in_file(pathname)
|
59
|
-
pathname.open do |input|
|
60
|
-
processed_bytes = 0
|
61
|
-
if @resume_job
|
62
|
-
dumpfile = Dumpfile.get(pathname, @resume_job)
|
63
|
-
log "Starting file #{pathname} from offset #{dumpfile.offset}"
|
64
|
-
input.seek(dumpfile.offset)
|
65
|
-
end
|
66
|
-
line_number = 0
|
67
|
-
@pipe_wr.write "FILE #{pathname.basename}\n"
|
68
|
-
while line = input.gets
|
69
|
-
line_number += 1
|
70
|
-
processed_bytes += line.size
|
71
|
-
|
72
|
-
process_event(line)
|
73
|
-
dumpfile.set(input.tell)
|
74
|
-
|
75
|
-
if processed_bytes > 100_000
|
76
|
-
@pipe_wr.write "OK #{processed_bytes}\n"
|
77
|
-
processed_bytes = 0
|
78
|
-
end
|
79
|
-
end
|
80
|
-
@pipe_wr.write "OK #{processed_bytes}\n"
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
end
|
85
|
-
end
|
data/lib/kmdb/property.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
require 'kmdb/belongs_to_user'
|
2
|
-
|
3
|
-
module KMDB
|
4
|
-
class Property < CustomRecord
|
5
|
-
include BelongsToUser
|
6
|
-
|
7
|
-
set_table_name "properties"
|
8
|
-
belongs_to :event, :class_name => 'KMDB::Event'
|
9
|
-
|
10
|
-
default_scope :order => 't DESC'
|
11
|
-
named_scope :named, lambda { |name| { :conditions => { :key => KMDB::Key.get(name) } } }
|
12
|
-
|
13
|
-
def self.set(hash, stamp=nil, user=nil, event=nil)
|
14
|
-
user_name = hash.delete('_p')
|
15
|
-
user ||= User.get(user_name)
|
16
|
-
raise UserError.new "User missing for '#{user_name}'" unless user.present?
|
17
|
-
|
18
|
-
event_id = event ? event.id : nil
|
19
|
-
stamp = Time.at hash.delete('_t') || stamp
|
20
|
-
|
21
|
-
return if hash.empty?
|
22
|
-
sql_insert = "INSERT INTO `#{table_name}` (`t`,`user_id`,`event_id`,`key`,`value`) VALUES "
|
23
|
-
sql_values = []
|
24
|
-
|
25
|
-
hash.each_pair do |prop_name,value|
|
26
|
-
key = Key.get(prop_name)
|
27
|
-
sql_values << sanitize_sql_array(["(?,?,?,?,?)", stamp,user.id,event_id,key,value])
|
28
|
-
end
|
29
|
-
|
30
|
-
connection.execute(sql_insert + sql_values.join(","))
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
data/lib/kmdb/user.rb
DELETED
@@ -1,83 +0,0 @@
|
|
1
|
-
require 'kmdb/has_properties'
|
2
|
-
|
3
|
-
module KMDB
|
4
|
-
class User < CustomRecord
|
5
|
-
include HasProperties
|
6
|
-
|
7
|
-
set_table_name "users"
|
8
|
-
|
9
|
-
has_many :events, :class_name => 'KMDB::Event'
|
10
|
-
belongs_to :alias, :class_name => 'KMDB::User'
|
11
|
-
# points to the aliased user. if set, no properties/events should belong to this user
|
12
|
-
|
13
|
-
validates_presence_of :name
|
14
|
-
validates_uniqueness_of :name
|
15
|
-
|
16
|
-
named_scope :named, lambda { |name| { :conditions => { :name => name } } }
|
17
|
-
|
18
|
-
named_scope :duplicates, lambda {{
|
19
|
-
:select => "id, COUNT(id) AS quantity", :group => :name, :having => "quantity > 1"
|
20
|
-
}}
|
21
|
-
|
22
|
-
# return (latest) value of property
|
23
|
-
def prop(name)
|
24
|
-
properties.named(name).first.andand.value
|
25
|
-
end
|
26
|
-
|
27
|
-
# mark this user as aliasing another
|
28
|
-
def aliases!(other)
|
29
|
-
[Property,Event].each do |model|
|
30
|
-
model.user_is(self).update_all({:user_id => other.id})
|
31
|
-
end
|
32
|
-
self.update_attributes!(:alias => other)
|
33
|
-
end
|
34
|
-
|
35
|
-
# return the user named `name` (creating it if necessary)
|
36
|
-
# if `name` is an alias, return the original user
|
37
|
-
def self.get(name)
|
38
|
-
user = named(name).first || create(:name => name)
|
39
|
-
user = user.alias while user.alias
|
40
|
-
return user
|
41
|
-
end
|
42
|
-
|
43
|
-
|
44
|
-
# mark the two names as pointing to the same user
|
45
|
-
def self.alias!(name1, name2)
|
46
|
-
u1 = get(name1)
|
47
|
-
u2 = get(name2)
|
48
|
-
$stderr.write "Warning: user '#{user.name}' has an alias\n" if u1.alias
|
49
|
-
$stderr.write "Warning: user '#{user.name}' has an alias\n" if u2.alias
|
50
|
-
|
51
|
-
# nothing to do if both names already point to the same user
|
52
|
-
return if u1 == u2
|
53
|
-
|
54
|
-
u2.aliases! u1
|
55
|
-
end
|
56
|
-
|
57
|
-
|
58
|
-
# duplication can occur during parallel imports because we're not running transactionally.
|
59
|
-
def self.fix_duplicates!
|
60
|
-
duplicates.map(&:name).each do |name|
|
61
|
-
named(name).all.tap do |all_users|
|
62
|
-
kept_user = all_users.pop
|
63
|
-
all_users.each do |user|
|
64
|
-
user.aliases! kept_user
|
65
|
-
user.destroy
|
66
|
-
end
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
|
72
|
-
# detect alias chains
|
73
|
-
def self.resolve_alias_chains!
|
74
|
-
find(:all, :joins => :alias, :conditions => 'aliases_users.alias_id IS NOT NULL').each do |user|
|
75
|
-
user = find(user.id)
|
76
|
-
origin = find(user.alias_id)
|
77
|
-
origin = origin.alias while origin.alias # go up the chain
|
78
|
-
$stderr.write "Aliasing #{user.name} -> #{origin.name}\n"
|
79
|
-
user.aliases!(origin)
|
80
|
-
end
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|