km-db 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +6 -0
- data/Gemfile.lock +46 -0
- data/README.markdown +91 -0
- data/Rakefile +5 -0
- data/bin/km_db_import +36 -0
- data/km-db.gemspec +32 -0
- data/lib/kmdb/belongs_to_user.rb +15 -0
- data/lib/kmdb/custom_record.rb +54 -0
- data/lib/kmdb/dumpfile.rb +23 -0
- data/lib/kmdb/event.rb +39 -0
- data/lib/kmdb/has_properties.rb +33 -0
- data/lib/kmdb/key.rb +56 -0
- data/lib/kmdb/migration.rb +63 -0
- data/lib/kmdb/parallel_parser.rb +85 -0
- data/lib/kmdb/parser.rb +143 -0
- data/lib/kmdb/property.rb +33 -0
- data/lib/kmdb/user.rb +83 -0
- data/lib/kmdb/user_error.rb +2 -0
- data/lib/kmdb/version.rb +4 -0
- data/lib/kmdb.rb +10 -0
- metadata +234 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
km-db (0.2.1)
|
5
|
+
activerecord (~> 2.3.12)
|
6
|
+
andand
|
7
|
+
parallel
|
8
|
+
progressbar
|
9
|
+
yajl-ruby
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: http://rubygems.org/
|
13
|
+
specs:
|
14
|
+
activerecord (2.3.18)
|
15
|
+
activesupport (= 2.3.18)
|
16
|
+
activesupport (2.3.18)
|
17
|
+
andand (1.3.3)
|
18
|
+
diff-lcs (1.1.3)
|
19
|
+
json (1.7.7)
|
20
|
+
parallel (0.6.3)
|
21
|
+
progressbar (0.20.0)
|
22
|
+
rake (10.0.3)
|
23
|
+
rspec (2.4.0)
|
24
|
+
rspec-core (~> 2.4.0)
|
25
|
+
rspec-expectations (~> 2.4.0)
|
26
|
+
rspec-mocks (~> 2.4.0)
|
27
|
+
rspec-core (2.4.0)
|
28
|
+
rspec-expectations (2.4.0)
|
29
|
+
diff-lcs (~> 1.1.2)
|
30
|
+
rspec-mocks (2.4.0)
|
31
|
+
sqlite3 (1.3.7)
|
32
|
+
sqlite3-ruby (1.3.3)
|
33
|
+
sqlite3 (>= 1.3.3)
|
34
|
+
yajl-ruby (1.1.0)
|
35
|
+
|
36
|
+
PLATFORMS
|
37
|
+
ruby
|
38
|
+
|
39
|
+
DEPENDENCIES
|
40
|
+
bundler (>= 1.0.0)
|
41
|
+
json
|
42
|
+
km-db!
|
43
|
+
progressbar
|
44
|
+
rake
|
45
|
+
rspec (~> 2.4.0)
|
46
|
+
sqlite3-ruby
|
data/README.markdown
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
The `km-db` gem should be useful to KissMetrics (KM) users.
|
2
|
+
Its aim is to efficiently process data obtained with KM's "Data Export" feature.
|
3
|
+
|
4
|
+
It is meant to :
|
5
|
+
|
6
|
+
* import KM event dumps into a SQL database (preferably MySQL / PostgreSQL)
|
7
|
+
* quickly process KM event dumps
|
8
|
+
|
9
|
+
Once imported, you can run complex queries against your visit history, for instance run multivariate analysis.
|
10
|
+
|
11
|
+
Beware though, KM data can be huge, and processing it is taxing !
|
12
|
+
|
13
|
+
|
14
|
+
Installing
|
15
|
+
----------
|
16
|
+
|
17
|
+
Add this to your Gemfile if you're using Bundler:
|
18
|
+
|
19
|
+
gem 'km-db', :git => 'git://github.com/HouseTrip/km-db.git'
|
20
|
+
|
21
|
+
|
22
|
+
Importing data
|
23
|
+
--------------
|
24
|
+
|
25
|
+
Running reports on raw logs can be less effective than running against a (relational) database.
|
26
|
+
`km-db` provides a `km_db_import` executable. Run it with:
|
27
|
+
|
28
|
+
$ bundle exec km_db_import <data-dump-directory>…
|
29
|
+
|
30
|
+
By default, your events will be imported into `test.db`, a SQLite database.
|
31
|
+
|
32
|
+
You can create `km_db.yml` or `config/km_db.yml` to have it import using another adapter, for instance:
|
33
|
+
|
34
|
+
---- km_db.yml ----
|
35
|
+
adapter: mysql2
|
36
|
+
database: km_events
|
37
|
+
user: root
|
38
|
+
|
39
|
+
Remember to add `sqlite3-ruby` or `mysql2` to your Gemfile.
|
40
|
+
|
41
|
+
|
42
|
+
Using imported data
|
43
|
+
-------------------
|
44
|
+
|
45
|
+
The `KMDB` module exposes four `ActiveRecord` classes:
|
46
|
+
`Event`, `Property`, `User` are the main domain objects.
|
47
|
+
`Key` is used to intern strings (event and property names) for performance.
|
48
|
+
|
49
|
+
### Finding events and properties
|
50
|
+
|
51
|
+
All visits during Jan. 2012:
|
52
|
+
|
53
|
+
KMDB::Event.before('2012-02-1').after('2012-01-01').named('visited site').by_date
|
54
|
+
|
55
|
+
All of a user's visits:
|
56
|
+
|
57
|
+
KMDB::User.last.events.named('visited site')
|
58
|
+
|
59
|
+
A user's referers:
|
60
|
+
|
61
|
+
KMDB::User.last.properties.named('referer').map(&:value)
|
62
|
+
|
63
|
+
Load some properties with events (uses a left join by default):
|
64
|
+
|
65
|
+
KMDB::User.last.events.with_properties('a prop', 'another prop').map(&:another_prop)
|
66
|
+
|
67
|
+
Note that many more complex queries will require building SQL queries directly.
|
68
|
+
|
69
|
+
|
70
|
+
Processing data
|
71
|
+
---------------
|
72
|
+
|
73
|
+
You don't have to import to filter your data.
|
74
|
+
|
75
|
+
The two classes you're looking for are `KMDB::Parser` and `KMDB::ParallelParser`.
|
76
|
+
The latter runs your filter task on all available CPUs, using the `parallel` gem.
|
77
|
+
|
78
|
+
The following example counts the number of *aliasing* events in all JSON files under `dumps/`:
|
79
|
+
|
80
|
+
require 'rubygems'
|
81
|
+
require 'kmdb'
|
82
|
+
|
83
|
+
counter = 0
|
84
|
+
parser = KMDB::Parser.new
|
85
|
+
parser.add_filter do |text,event|
|
86
|
+
counter += 1 if event['_p2']
|
87
|
+
end
|
88
|
+
parser.run('dumps/')
|
89
|
+
puts counter
|
90
|
+
|
91
|
+
Note that it will not work with `ParallelParser`, as the `counter` variable will be different for each process.
|
data/Rakefile
ADDED
data/bin/km_db_import
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
=begin
|
3
|
+
|
4
|
+
Import KM events from the raw dumps.
|
5
|
+
|
6
|
+
=end
|
7
|
+
|
8
|
+
require 'rubygems'
|
9
|
+
require 'kmdb'
|
10
|
+
|
11
|
+
if KMDB::Event.connection.class.to_s =~ /(mysql|pgsql)/i
|
12
|
+
parser_class = KMDB::ParallelParser
|
13
|
+
else
|
14
|
+
parser_class = KMDB::Parser
|
15
|
+
end
|
16
|
+
|
17
|
+
parser = parser_class.new(:resume => 'import',
|
18
|
+
:verbose => true)
|
19
|
+
|
20
|
+
# import events by category
|
21
|
+
parser.add_filter { |text, event|
|
22
|
+
if event['_p2']
|
23
|
+
KMDB::User.alias! event['_p'], event['_p2']
|
24
|
+
elsif event['_n']
|
25
|
+
KMDB::Event.record event
|
26
|
+
else
|
27
|
+
KMDB::Property.set event
|
28
|
+
end
|
29
|
+
|
30
|
+
event
|
31
|
+
}
|
32
|
+
|
33
|
+
parser.run(ARGV) # heavy lifting here
|
34
|
+
KMDB::Event.connection.reconnect! # reconnect to database (breaks because of processes forking off)
|
35
|
+
KMDB::User.resolve_alias_chains! # detect and filter alias chains
|
36
|
+
KMDB::Key.fix_duplicates! # remove key duplicates
|
data/km-db.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path("../lib/kmdb/version", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = "km-db"
|
6
|
+
s.version = KMDB::VERSION
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ["HouseTrip"]
|
9
|
+
s.email = ["jtl@housetrip.com"]
|
10
|
+
s.homepage = "https://github.com/housetrip/km-db"
|
11
|
+
s.summary = "Process KISSmetrics data dumps"
|
12
|
+
s.description = "Process KISSmetrics data dumps"
|
13
|
+
|
14
|
+
s.required_rubygems_version = ">= 1.3.6"
|
15
|
+
|
16
|
+
s.add_development_dependency "bundler", ">= 1.0.0"
|
17
|
+
s.add_development_dependency "rspec", "~> 2.4.0"
|
18
|
+
s.add_development_dependency "rake"
|
19
|
+
s.add_development_dependency "json"
|
20
|
+
s.add_development_dependency "sqlite3-ruby"
|
21
|
+
|
22
|
+
s.add_dependency "yajl-ruby"
|
23
|
+
s.add_dependency "progressbar"
|
24
|
+
s.add_dependency "parallel"
|
25
|
+
s.add_dependency "andand"
|
26
|
+
s.add_dependency "activerecord", "~> 2.3.12"
|
27
|
+
|
28
|
+
s.files = `git ls-files`.split("\n")
|
29
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
30
|
+
s.executables = `git ls-files`.split("\n").map{|f| f =~ /^bin\/(.*)/ ? $1 : nil}.compact
|
31
|
+
s.require_path = 'lib'
|
32
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module KMDB
|
2
|
+
module BelongsToUser
|
3
|
+
def self.included(mod)
|
4
|
+
mod.class_eval do
|
5
|
+
belongs_to :user, :class_name => 'KMDB::User'
|
6
|
+
validates_presence_of :user
|
7
|
+
|
8
|
+
named_scope :user_is, lambda { |user|
|
9
|
+
user.kind_of?(User) or raise TypeError.new("Not a kind of User")
|
10
|
+
{ :conditions => { :user_id => user.id } }
|
11
|
+
}
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Base class for KM data.
|
4
|
+
Connect to a secondary database to store events, users, & properties.
|
5
|
+
|
6
|
+
FIXME: the database connection is hard-coded for now.
|
7
|
+
|
8
|
+
=end
|
9
|
+
|
10
|
+
require 'active_record'
|
11
|
+
require 'erb'
|
12
|
+
require 'yaml'
|
13
|
+
require 'kmdb/migration'
|
14
|
+
|
15
|
+
|
16
|
+
module KMDB
|
17
|
+
class CustomRecord < ActiveRecord::Base
|
18
|
+
DefaultConfig = {
|
19
|
+
'adapter' => 'sqlite3',
|
20
|
+
'database' => "test.db"
|
21
|
+
}
|
22
|
+
|
23
|
+
def self.disable_index
|
24
|
+
connection.execute %Q{
|
25
|
+
ALTER TABLE `#{table_name}` DISABLE KEYS;
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.enable_index
|
30
|
+
connection.execute %Q{
|
31
|
+
ALTER TABLE `#{table_name}` ENABLE KEYS;
|
32
|
+
}
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.find_or_create(options)
|
36
|
+
find(:first, :conditions => options) || create(options)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Establish the connection used by all KMDB models and create the schema
# if it is not there yet.
#
# Settings start from DefaultConfig and are overridden by the first of
# `km_db.yml` / `config/km_db.yml` found relative to the current working
# directory. The file is run through ERB before being parsed as YAML.
# A file that does not evaluate to a mapping (e.g. an empty file) is
# ignored instead of raising from `merge!`.
def self.connect_to_km_db!
  config = DefaultConfig.dup

  config_path = ['km_db.yml', 'config/km_db.yml'].find { |path| File.exist?(path) }
  if config_path
    # File.read closes the handle; File.open(...).read left it open
    overrides = YAML.load(ERB.new(File.read(config_path)).result)
    config.merge!(overrides) if overrides.is_a?(Hash)
  end

  establish_connection(config)

  # first run against this database: build the tables
  unless connection.table_exists?('events')
    SetupEventsDatabase.up
    reset_column_information
  end
end
|
53
|
+
end
|
54
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'kmdb/custom_record'
|
2
|
+
|
3
|
+
module KMDB
|
4
|
+
class Dumpfile < CustomRecord
|
5
|
+
set_table_name "dumpfiles"
|
6
|
+
|
7
|
+
validates_presence_of :offset
|
8
|
+
validates_presence_of :path
|
9
|
+
|
10
|
+
def set(offset)
|
11
|
+
update_attributes!(:offset => offset)
|
12
|
+
end
|
13
|
+
|
14
|
+
def offset
|
15
|
+
attributes['offset'] || 0
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.get(pathname, job = nil)
|
19
|
+
job ||= 'nil'
|
20
|
+
find_or_create(:path => pathname.cleanpath.to_s, :job => job)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/kmdb/event.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'kmdb/custom_record'
|
2
|
+
require 'kmdb/belongs_to_user'
|
3
|
+
require 'kmdb/has_properties'
|
4
|
+
|
5
|
+
module KMDB
|
6
|
+
class Event < CustomRecord
|
7
|
+
include BelongsToUser
|
8
|
+
include HasProperties
|
9
|
+
|
10
|
+
set_table_name "events"
|
11
|
+
|
12
|
+
named_scope :before, lambda { |date| { :conditions => ["`#{table_name}`.`t` < ?", date] } }
|
13
|
+
named_scope :after, lambda { |date| { :conditions => ["`#{table_name}`.`t` > ?", date] } }
|
14
|
+
|
15
|
+
named_scope :named, lambda { |name| { :conditions => { :n => KMDB::Key.get(name) } } }
|
16
|
+
|
17
|
+
named_scope :by_date, lambda { { :order => "`#{table_name}`.`t` ASC" } }
|
18
|
+
|
19
|
+
# return value of property
|
20
|
+
def prop(name)
|
21
|
+
properties.named(name).first.andand.value
|
22
|
+
end
|
23
|
+
|
24
|
+
# Human-readable name of this event: the text interned under the key ID
# stored in `n`. Key rows hold that text in their `string` column (there is
# no `value` column on the `keys` table).
def name
  KMDB::Key.find(n).string
end
|
27
|
+
|
28
|
+
def self.record(hash)
|
29
|
+
user_name = hash.delete('_p')
|
30
|
+
user ||= User.get(user_name)
|
31
|
+
raise UserError.new "User missing for '#{user_name}'" unless user.present?
|
32
|
+
|
33
|
+
stamp = Time.at hash.delete('_t')
|
34
|
+
key = Key.get hash.delete('_n')
|
35
|
+
event = create(:t => stamp, :n => key, :user => user)
|
36
|
+
Property.set(hash, stamp, user, event)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
KMDB::HasProperties --
|
4
|
+
|
5
|
+
Trait shared by Event and User.
|
6
|
+
|
7
|
+
=end
|
8
|
+
|
9
|
+
module KMDB
|
10
|
+
module HasProperties
|
11
|
+
def self.included(mod)
|
12
|
+
mod.class_eval do
|
13
|
+
has_many :properties, :class_name => 'KMDB::Property'
|
14
|
+
|
15
|
+
named_scope :with_properties, lambda { |*props|
|
16
|
+
direction = props.delete(:exclude_missing) ? 'INNER' : 'LEFT'
|
17
|
+
prop_table = Property.table_name
|
18
|
+
selects = ["`#{table_name}`.*"]
|
19
|
+
joins = []
|
20
|
+
props.each_with_index { |prop,k|
|
21
|
+
temp_name = "#{prop_table}_#{k}"
|
22
|
+
selects << "`#{temp_name}`.`value` AS `#{prop.split.join('_')}`"
|
23
|
+
joins << sanitize_sql_array([%Q{
|
24
|
+
#{direction} JOIN `properties` AS `#{temp_name}`
|
25
|
+
ON `#{table_name}`.id = `#{temp_name}`.event_id
|
26
|
+
AND `#{temp_name}`.`key` = ?}, KMDB::Key.get(prop)])
|
27
|
+
}
|
28
|
+
{ :select => selects.join(', '), :joins => joins.join("\n") }
|
29
|
+
}
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/kmdb/key.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Map strings (event and property names) to unique integers (Key#id) for performance
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'kmdb/custom_record'
|
8
|
+
|
9
|
+
module KMDB
|
10
|
+
class Key < CustomRecord
|
11
|
+
set_table_name "keys"
|
12
|
+
|
13
|
+
has_many :events, :foreign_key => :n, :class_name => 'KMDB::Event', :dependent => :delete_all
|
14
|
+
has_many :properties, :foreign_key => :key, :class_name => 'KMDB::Property', :dependent => :delete_all
|
15
|
+
|
16
|
+
named_scope :has_duplicate, lambda {
|
17
|
+
{
|
18
|
+
:select => "id, string, COUNT(id) AS quantity",
|
19
|
+
:group => :string, :having => "quantity > 1"
|
20
|
+
}
|
21
|
+
}
|
22
|
+
|
23
|
+
def self.get(string)
|
24
|
+
@cache ||= {}
|
25
|
+
@cache[string] ||= get_uncached(string)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Collapse duplicated keys: for every string stored under more than one Key
# row, keep the most-used row and repoint events and properties at it.
#
# Usage is counted directly on the referencing columns (`events.n` and
# `properties.key`). The `named` scopes cannot be used with an ID: they go
# through Key.get, which treats its argument as a string to intern and would
# look up (or create) a key whose string is the ID itself.
#
# Side effects: updates `events` and `properties`, deletes the redundant
# `keys` rows, and writes one line per fixed string to $stderr.
def self.fix_duplicates!
  has_duplicate.map(&:string).each do |string|
    all_keys = find(:all, :conditions => { :string => string })

    # [id, usage] pairs
    usage_by_id = all_keys.map { |key|
      usage = Event.count(:conditions => { :n => key.id }) +
              Property.count(:conditions => { :key => key.id })
      [key.id, usage]
    }

    # least-used first, so the most-used ID ends up last
    all_ids = usage_by_id.sort_by { |pair| pair.last }.map { |pair| pair.first }

    id_to_keep = all_ids.pop # all_ids now only holds the IDs to get rid of
    $stderr.write "Fixing key '#{string}' #{all_ids.inspect} -> #{id_to_keep.inspect}\n"
    Event.update_all({ :n => id_to_keep }, ["`events`.`n` IN (?)", all_ids])
    Property.update_all({ :key => id_to_keep }, ["`properties`.`key` IN (?)", all_ids])
    Key.delete_all(["id IN (?)", all_ids])

    # the ID memoized by Key.get for this string may be one we just deleted
    @cache.delete(string) if @cache
  end
end
|
48
|
+
|
49
|
+
private
|
50
|
+
|
51
|
+
def self.get_uncached(string)
|
52
|
+
string.size <= MaxStringSize or raise "String is too long"
|
53
|
+
find_or_create(:string => string).id
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Setup a custom database for KissMetrics tracking events.
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
module KMDB
|
10
|
+
class SetupEventsDatabase < ActiveRecord::Migration
|
11
|
+
def self.connection
|
12
|
+
CustomRecord.connection
|
13
|
+
end
|
14
|
+
|
15
|
+
def self.up
|
16
|
+
create_table :events do |t|
|
17
|
+
t.integer :user_id
|
18
|
+
t.integer :n
|
19
|
+
t.datetime :t
|
20
|
+
end
|
21
|
+
add_index :events, [:n]
|
22
|
+
add_index :events, [:user_id]
|
23
|
+
|
24
|
+
|
25
|
+
create_table :keys do |t|
|
26
|
+
t.string :string, :limit => MaxStringSize
|
27
|
+
end
|
28
|
+
add_index :keys, [:string]
|
29
|
+
|
30
|
+
create_table :properties do |t|
|
31
|
+
t.integer :user_id
|
32
|
+
t.integer :event_id
|
33
|
+
t.integer :key
|
34
|
+
t.string :value, :limit => 64
|
35
|
+
t.datetime :t
|
36
|
+
end
|
37
|
+
add_index :properties, [:key]
|
38
|
+
add_index :properties, [:user_id]
|
39
|
+
add_index :properties, [:event_id]
|
40
|
+
|
41
|
+
create_table :users do |t|
|
42
|
+
t.string :name, :limit => 48
|
43
|
+
t.integer :alias_id
|
44
|
+
end
|
45
|
+
add_index :users, [:name]
|
46
|
+
|
47
|
+
create_table :dumpfiles do |t|
|
48
|
+
t.string :path
|
49
|
+
t.string :job
|
50
|
+
t.integer :offset
|
51
|
+
end
|
52
|
+
add_index :dumpfiles, [:path]
|
53
|
+
|
54
|
+
end
|
55
|
+
|
56
|
+
# Revert `up` by dropping every table it creates.
# (There is no `aliases` table: aliasing lives in `users.alias_id`.)
def self.down
  # reverse order of creation
  [:dumpfiles, :users, :properties, :keys, :events].each do |table|
    drop_table table
  end
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'kmdb/parser'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module KMDB
|
5
|
+
class ParallelParser < Parser
|
6
|
+
|
7
|
+
def initialize(options = {})
|
8
|
+
super(options)
|
9
|
+
@worker_count = options.delete(:workers) || Parallel.processor_count
|
10
|
+
end
|
11
|
+
|
12
|
+
def run(argv)
|
13
|
+
@pipe_rd, @pipe_wr = IO.pipe
|
14
|
+
|
15
|
+
inputs = list_files_in(argv)
|
16
|
+
total_bytes = total_size_of_files(inputs)
|
17
|
+
log "total bytes : #{total_bytes}"
|
18
|
+
total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
|
19
|
+
log "left to process : #{total_bytes}"
|
20
|
+
|
21
|
+
# Start workers
|
22
|
+
log "Using #{@worker_count} workers."
|
23
|
+
Process.fork do
|
24
|
+
@pipe_rd.close
|
25
|
+
Parallel.each(inputs, :in_processes => @worker_count) do |input|
|
26
|
+
KMDB::Event.connection.reconnect!
|
27
|
+
log "Worker #{Process.pid} starting #{input}"
|
28
|
+
$0 = "worker: #{input}"
|
29
|
+
process_events_in_file(input)
|
30
|
+
log "Worker #{Process.pid} done"
|
31
|
+
true
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
# Start gatherer
|
36
|
+
$0 = "gatherer: #{$0}"
|
37
|
+
@pipe_wr.close
|
38
|
+
byte_counter = 0
|
39
|
+
log "Starting gatherer, total bytes: #{total_bytes}"
|
40
|
+
progress = ProgressBar.new("-" * 20, total_bytes)
|
41
|
+
while line = @pipe_rd.gets
|
42
|
+
if line =~ /^OK (\d+)$/
|
43
|
+
byte_counter += $1.to_i
|
44
|
+
progress.set byte_counter
|
45
|
+
elsif line =~ /^FILE (.*)$/
|
46
|
+
progress.title = $1
|
47
|
+
else
|
48
|
+
log "Unparsed line: '#{line}'"
|
49
|
+
end
|
50
|
+
end
|
51
|
+
progress.finish
|
52
|
+
log "Total bytes processed: #{byte_counter}"
|
53
|
+
Process.waitall
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
|
58
|
+
# Worker-side file reader: feed every line of `pathname` to process_event
# and report progress on @pipe_wr, using the line protocol parsed by the
# gatherer loop in `run`:
#
#   FILE <basename>   - a new file is being processed
#   OK <count>        - <count> more units (String#size of the lines read)
#                       were processed
#
# pathname - Pathname of the dump file.
#
# When a resume job is set, reading starts at the offset recorded for this
# file and the offset is saved after every line. Without a resume job no
# Dumpfile record is touched.
def process_events_in_file(pathname)
  pathname.open do |input|
    processed_bytes = 0
    dumpfile = nil

    if @resume_job
      dumpfile = Dumpfile.get(pathname, @resume_job)
      log "Starting file #{pathname} from offset #{dumpfile.offset}"
      input.seek(dumpfile.offset)
    end

    @pipe_wr.write "FILE #{pathname.basename}\n"
    while line = input.gets
      processed_bytes += line.size

      process_event(line)
      # only persist the position when resuming is enabled
      dumpfile.set(input.tell) if dumpfile

      # batch progress reports so the pipe is not flooded
      if processed_bytes > 100_000
        @pipe_wr.write "OK #{processed_bytes}\n"
        processed_bytes = 0
      end
    end
    # flush whatever is left below the reporting threshold
    @pipe_wr.write "OK #{processed_bytes}\n"
  end
end
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
data/lib/kmdb/parser.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'yajl/json_gem'
|
2
|
+
require 'pathname'
|
3
|
+
require 'progressbar'
|
4
|
+
require 'pstore'
|
5
|
+
|
6
|
+
module KMDB
|
7
|
+
class Parser
|
8
|
+
class ProgressBar < ::ProgressBar
|
9
|
+
attr_writer :title
|
10
|
+
end
|
11
|
+
|
12
|
+
attr :resume_job
|
13
|
+
attr :verbose
|
14
|
+
attr :abort_on_error
|
15
|
+
|
16
|
+
def initialize(options = {})
|
17
|
+
@processed_bytes = nil
|
18
|
+
@total_bytes = nil
|
19
|
+
@exclude_regexps = []
|
20
|
+
@include_regexps = []
|
21
|
+
@filters = []
|
22
|
+
@verbose = options.delete(:verbose)
|
23
|
+
@resume_job = options.delete(:resume)
|
24
|
+
@abort_on_error = options.delete(:abort_on_error)
|
25
|
+
|
26
|
+
if @resume_job && @verbose && Dumpfile.count > 0
|
27
|
+
log "Using restart information"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def exclude(regexp)
|
32
|
+
@exclude_regexps << regexp
|
33
|
+
self
|
34
|
+
end
|
35
|
+
|
36
|
+
def only(regexp)
|
37
|
+
@include_regexps << regexp
|
38
|
+
self
|
39
|
+
end
|
40
|
+
|
41
|
+
def add_filter(&block)
|
42
|
+
@filters << block
|
43
|
+
self
|
44
|
+
end
|
45
|
+
|
46
|
+
def run(argv)
|
47
|
+
inputs = list_files_in(argv)
|
48
|
+
total_bytes = total_size_of_files(inputs)
|
49
|
+
log "total bytes : #{total_bytes}"
|
50
|
+
total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
|
51
|
+
log "left to process : #{total_bytes}"
|
52
|
+
|
53
|
+
@processed_bytes = 0
|
54
|
+
@progress = ProgressBar.new("-" * 20, total_bytes)
|
55
|
+
@progress.long_running if @progress.respond_to?(:long_running)
|
56
|
+
|
57
|
+
inputs.sort.each do |input|
|
58
|
+
process_events_in_file(input)
|
59
|
+
end
|
60
|
+
|
61
|
+
@progress.finish
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def log(message)
|
67
|
+
$stderr.write(message + "\n") if @verbose
|
68
|
+
end
|
69
|
+
|
70
|
+
# Run one raw dump line through the include/exclude patterns, decode it as
# JSON and hand it to the registered filters.
#
# text - one line of a dump file (String).
#
# Lines matching any exclude pattern, or failing any include pattern, are
# skipped. Lines that cannot be decoded are logged and skipped, unless
# @abort_on_error is set, in which case the error is re-raised. Filters run
# in registration order; each receives (text, data), where data is what the
# previous filter returned. A nil/false return stops the chain.
def process_event(text)
  return if @exclude_regexps.any? { |re| text =~ re }
  return unless @include_regexps.all? { |re| text =~ re }

  # filter strange utf-8 encoding/escaping found in KM dumps
  if text =~ /\\30[3-5]\\[0-9]{3}/
    begin
      # SECURITY FIXME: this evaluates the dump line as a Ruby string literal,
      # so a crafted line (interpolation, or a `)` closing the literal) can
      # run arbitrary code. Only feed trusted dumps until this is replaced by
      # a dedicated octal-escape decoder.
      text = eval("%Q(#{text})")
    rescue SyntaxError => e
      log "Syntax error in: #{text}"
      raise e if @abort_on_error
    rescue StandardError => e
      # the evaluated line may also fail at run time (e.g. NameError from an
      # interpolation); treat it like any other undecodable line
      log "Error (#{e.class}) while unescaping: #{text}"
      raise e if @abort_on_error
    end
  end

  begin
    data = JSON.parse(text)
  rescue JSON::ParserError => e
    log "Warning, JSON parse error in: #{text}"
    raise e if @abort_on_error
    return
  end

  if data.nil?
    log "Warning, JSON parse failed in: #{text}"
    return
  end

  @filters.each do |filter|
    data = filter.call(text, data) or break
  end
end
|
101
|
+
|
102
|
+
def process_events_in_file(pathname)
|
103
|
+
pathname.open do |input|
|
104
|
+
@progress.title = pathname.basename.to_s
|
105
|
+
if @resume_job
|
106
|
+
dumpfile = Dumpfile.get(pathname, @resume_job)
|
107
|
+
log "Starting file #{pathname} from offset #{dumpfile.offset}"
|
108
|
+
input.seek(dumpfile.offset)
|
109
|
+
end
|
110
|
+
line_number = 0
|
111
|
+
while line = input.gets
|
112
|
+
@processed_bytes += line.size
|
113
|
+
@progress.set @processed_bytes if line_number % 100 == 0
|
114
|
+
line_number += 1
|
115
|
+
|
116
|
+
process_event(line)
|
117
|
+
dumpfile.set(input.tell) if @resume_job
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
def total_size_of_files(inputs)
|
123
|
+
inputs.map { |c| c.stat.size }.inject(0) { |a,b| a+b }
|
124
|
+
end
|
125
|
+
|
126
|
+
def list_files_in_directory(directory)
|
127
|
+
input_fns = []
|
128
|
+
directory.find do |input_pn|
|
129
|
+
input_pn.to_s =~ /\.json$/ or next
|
130
|
+
input_fns << input_pn
|
131
|
+
end
|
132
|
+
input_fns.sort
|
133
|
+
end
|
134
|
+
|
135
|
+
def list_files_in(argv)
|
136
|
+
argv.map { |arg| Pathname.new(arg) }.map { |pn|
|
137
|
+
pn.exist? and pn or raise "No such file or directory '#{pn}'"
|
138
|
+
}.map { |pn|
|
139
|
+
pn.directory? ? list_files_in_directory(pn) : pn
|
140
|
+
}.flatten
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'kmdb/belongs_to_user'
|
2
|
+
|
3
|
+
module KMDB
|
4
|
+
class Property < CustomRecord
|
5
|
+
include BelongsToUser
|
6
|
+
|
7
|
+
set_table_name "properties"
|
8
|
+
belongs_to :event, :class_name => 'KMDB::Event'
|
9
|
+
|
10
|
+
default_scope :order => 't DESC'
|
11
|
+
named_scope :named, lambda { |name| { :conditions => { :key => KMDB::Key.get(name) } } }
|
12
|
+
|
13
|
+
def self.set(hash, stamp=nil, user=nil, event=nil)
|
14
|
+
user_name = hash.delete('_p')
|
15
|
+
user ||= User.get(user_name)
|
16
|
+
raise UserError.new "User missing for '#{user_name}'" unless user.present?
|
17
|
+
|
18
|
+
event_id = event ? event.id : nil
|
19
|
+
stamp = Time.at hash.delete('_t') || stamp
|
20
|
+
|
21
|
+
return if hash.empty?
|
22
|
+
sql_insert = "INSERT INTO `#{table_name}` (`t`,`user_id`,`event_id`,`key`,`value`) VALUES "
|
23
|
+
sql_values = []
|
24
|
+
|
25
|
+
hash.each_pair do |prop_name,value|
|
26
|
+
key = Key.get(prop_name)
|
27
|
+
sql_values << sanitize_sql_array(["(?,?,?,?,?)", stamp,user.id,event_id,key,value])
|
28
|
+
end
|
29
|
+
|
30
|
+
connection.execute(sql_insert + sql_values.join(","))
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
data/lib/kmdb/user.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'kmdb/has_properties'
|
2
|
+
|
3
|
+
module KMDB
|
4
|
+
class User < CustomRecord
|
5
|
+
include HasProperties
|
6
|
+
|
7
|
+
set_table_name "users"
|
8
|
+
|
9
|
+
has_many :events, :class_name => 'KMDB::Event'
|
10
|
+
belongs_to :alias, :class_name => 'KMDB::User'
|
11
|
+
# points to the aliased user. if set, no properties/events should belong to this user
|
12
|
+
|
13
|
+
validates_presence_of :name
|
14
|
+
validates_uniqueness_of :name
|
15
|
+
|
16
|
+
named_scope :named, lambda { |name| { :conditions => { :name => name } } }
|
17
|
+
|
18
|
+
# Users whose name occurs more than once (one record per duplicated name).
# `name` has to be part of the select list: fix_duplicates! reads it from
# the records this scope returns.
named_scope :duplicates, lambda {{
  :select => "id, name, COUNT(id) AS quantity", :group => :name, :having => "quantity > 1"
}}
|
21
|
+
|
22
|
+
# return (latest) value of property
|
23
|
+
def prop(name)
|
24
|
+
properties.named(name).first.andand.value
|
25
|
+
end
|
26
|
+
|
27
|
+
# mark this user as aliasing another
|
28
|
+
def aliases!(other)
|
29
|
+
[Property,Event].each do |model|
|
30
|
+
model.user_is(self).update_all({:user_id => other.id})
|
31
|
+
end
|
32
|
+
self.update_attributes!(:alias => other)
|
33
|
+
end
|
34
|
+
|
35
|
+
# return the user named `name` (creating it if necessary)
|
36
|
+
# if `name` is an alias, return the original user
|
37
|
+
def self.get(name)
|
38
|
+
user = named(name).first || create(:name => name)
|
39
|
+
user = user.alias while user.alias
|
40
|
+
return user
|
41
|
+
end
|
42
|
+
|
43
|
+
|
44
|
+
# Mark the two names as pointing to the same user: the user behind `name2`
# becomes an alias of the user behind `name1` (see #aliases!).
#
# name1, name2 - user names; missing users are created by `get`.
#
# Returns nil without touching anything when both names already resolve to
# the same user.
def self.alias!(name1, name2)
  u1 = get(name1)
  u2 = get(name2)

  # `get` follows alias links, so neither record is expected to carry one
  $stderr.write "Warning: user '#{u1.name}' has an alias\n" if u1.alias
  $stderr.write "Warning: user '#{u2.name}' has an alias\n" if u2.alias

  # nothing to do if both names already point to the same user
  return if u1 == u2

  u2.aliases! u1
end
|
56
|
+
|
57
|
+
|
58
|
+
# duplication can occur during parallel imports because we're not running transactionally.
|
59
|
+
def self.fix_duplicates!
|
60
|
+
duplicates.map(&:name).each do |name|
|
61
|
+
named(name).all.tap do |all_users|
|
62
|
+
kept_user = all_users.pop
|
63
|
+
all_users.each do |user|
|
64
|
+
user.aliases! kept_user
|
65
|
+
user.destroy
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
# detect alias chains
|
73
|
+
def self.resolve_alias_chains!
|
74
|
+
find(:all, :joins => :alias, :conditions => 'aliases_users.alias_id IS NOT NULL').each do |user|
|
75
|
+
user = find(user.id)
|
76
|
+
origin = find(user.alias_id)
|
77
|
+
origin = origin.alias while origin.alias # go up the chain
|
78
|
+
$stderr.write "Aliasing #{user.name} -> #{origin.name}\n"
|
79
|
+
user.aliases!(origin)
|
80
|
+
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
data/lib/kmdb/version.rb
ADDED
data/lib/kmdb.rb
ADDED
metadata
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: km-db
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- HouseTrip
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2013-03-23 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 23
|
28
|
+
segments:
|
29
|
+
- 1
|
30
|
+
- 0
|
31
|
+
- 0
|
32
|
+
version: 1.0.0
|
33
|
+
prerelease: false
|
34
|
+
name: bundler
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 31
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 4
|
47
|
+
- 0
|
48
|
+
version: 2.4.0
|
49
|
+
prerelease: false
|
50
|
+
name: rspec
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
63
|
+
prerelease: false
|
64
|
+
name: rake
|
65
|
+
type: :development
|
66
|
+
version_requirements: *id003
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
prerelease: false
|
78
|
+
name: json
|
79
|
+
type: :development
|
80
|
+
version_requirements: *id004
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
prerelease: false
|
92
|
+
name: sqlite3-ruby
|
93
|
+
type: :development
|
94
|
+
version_requirements: *id005
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
hash: 3
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
version: "0"
|
105
|
+
prerelease: false
|
106
|
+
name: yajl-ruby
|
107
|
+
type: :runtime
|
108
|
+
version_requirements: *id006
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
prerelease: false
|
120
|
+
name: progressbar
|
121
|
+
type: :runtime
|
122
|
+
version_requirements: *id007
|
123
|
+
- !ruby/object:Gem::Dependency
|
124
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
125
|
+
none: false
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
hash: 3
|
130
|
+
segments:
|
131
|
+
- 0
|
132
|
+
version: "0"
|
133
|
+
prerelease: false
|
134
|
+
name: parallel
|
135
|
+
type: :runtime
|
136
|
+
version_requirements: *id008
|
137
|
+
- !ruby/object:Gem::Dependency
|
138
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
hash: 3
|
144
|
+
segments:
|
145
|
+
- 0
|
146
|
+
version: "0"
|
147
|
+
prerelease: false
|
148
|
+
name: andand
|
149
|
+
type: :runtime
|
150
|
+
version_requirements: *id009
|
151
|
+
- !ruby/object:Gem::Dependency
|
152
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ~>
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
hash: 27
|
158
|
+
segments:
|
159
|
+
- 2
|
160
|
+
- 3
|
161
|
+
- 12
|
162
|
+
version: 2.3.12
|
163
|
+
prerelease: false
|
164
|
+
name: activerecord
|
165
|
+
type: :runtime
|
166
|
+
version_requirements: *id010
|
167
|
+
description: Process KISSmetrics data dumps
|
168
|
+
email:
|
169
|
+
- jtl@housetrip.com
|
170
|
+
executables:
|
171
|
+
- km_db_import
|
172
|
+
extensions: []
|
173
|
+
|
174
|
+
extra_rdoc_files: []
|
175
|
+
|
176
|
+
files:
|
177
|
+
- Gemfile
|
178
|
+
- Gemfile.lock
|
179
|
+
- README.markdown
|
180
|
+
- Rakefile
|
181
|
+
- bin/km_db_import
|
182
|
+
- km-db.gemspec
|
183
|
+
- lib/kmdb.rb
|
184
|
+
- lib/kmdb/belongs_to_user.rb
|
185
|
+
- lib/kmdb/custom_record.rb
|
186
|
+
- lib/kmdb/dumpfile.rb
|
187
|
+
- lib/kmdb/event.rb
|
188
|
+
- lib/kmdb/has_properties.rb
|
189
|
+
- lib/kmdb/key.rb
|
190
|
+
- lib/kmdb/migration.rb
|
191
|
+
- lib/kmdb/parallel_parser.rb
|
192
|
+
- lib/kmdb/parser.rb
|
193
|
+
- lib/kmdb/property.rb
|
194
|
+
- lib/kmdb/user.rb
|
195
|
+
- lib/kmdb/user_error.rb
|
196
|
+
- lib/kmdb/version.rb
|
197
|
+
has_rdoc: true
|
198
|
+
homepage: https://github.com/housetrip/km-db
|
199
|
+
licenses: []
|
200
|
+
|
201
|
+
post_install_message:
|
202
|
+
rdoc_options: []
|
203
|
+
|
204
|
+
require_paths:
|
205
|
+
- lib
|
206
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
207
|
+
none: false
|
208
|
+
requirements:
|
209
|
+
- - ">="
|
210
|
+
- !ruby/object:Gem::Version
|
211
|
+
hash: 3
|
212
|
+
segments:
|
213
|
+
- 0
|
214
|
+
version: "0"
|
215
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
216
|
+
none: false
|
217
|
+
requirements:
|
218
|
+
- - ">="
|
219
|
+
- !ruby/object:Gem::Version
|
220
|
+
hash: 23
|
221
|
+
segments:
|
222
|
+
- 1
|
223
|
+
- 3
|
224
|
+
- 6
|
225
|
+
version: 1.3.6
|
226
|
+
requirements: []
|
227
|
+
|
228
|
+
rubyforge_project:
|
229
|
+
rubygems_version: 1.3.9.5
|
230
|
+
signing_key:
|
231
|
+
specification_version: 3
|
232
|
+
summary: Process KISSmetrics data dumps
|
233
|
+
test_files: []
|
234
|
+
|