km-db 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +6 -0
- data/Gemfile.lock +46 -0
- data/README.markdown +91 -0
- data/Rakefile +5 -0
- data/bin/km_db_import +36 -0
- data/km-db.gemspec +32 -0
- data/lib/kmdb/belongs_to_user.rb +15 -0
- data/lib/kmdb/custom_record.rb +54 -0
- data/lib/kmdb/dumpfile.rb +23 -0
- data/lib/kmdb/event.rb +39 -0
- data/lib/kmdb/has_properties.rb +33 -0
- data/lib/kmdb/key.rb +56 -0
- data/lib/kmdb/migration.rb +63 -0
- data/lib/kmdb/parallel_parser.rb +85 -0
- data/lib/kmdb/parser.rb +143 -0
- data/lib/kmdb/property.rb +33 -0
- data/lib/kmdb/user.rb +83 -0
- data/lib/kmdb/user_error.rb +2 -0
- data/lib/kmdb/version.rb +4 -0
- data/lib/kmdb.rb +10 -0
- metadata +234 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
km-db (0.2.1)
|
5
|
+
activerecord (~> 2.3.12)
|
6
|
+
andand
|
7
|
+
parallel
|
8
|
+
progressbar
|
9
|
+
yajl-ruby
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: http://rubygems.org/
|
13
|
+
specs:
|
14
|
+
activerecord (2.3.18)
|
15
|
+
activesupport (= 2.3.18)
|
16
|
+
activesupport (2.3.18)
|
17
|
+
andand (1.3.3)
|
18
|
+
diff-lcs (1.1.3)
|
19
|
+
json (1.7.7)
|
20
|
+
parallel (0.6.3)
|
21
|
+
progressbar (0.20.0)
|
22
|
+
rake (10.0.3)
|
23
|
+
rspec (2.4.0)
|
24
|
+
rspec-core (~> 2.4.0)
|
25
|
+
rspec-expectations (~> 2.4.0)
|
26
|
+
rspec-mocks (~> 2.4.0)
|
27
|
+
rspec-core (2.4.0)
|
28
|
+
rspec-expectations (2.4.0)
|
29
|
+
diff-lcs (~> 1.1.2)
|
30
|
+
rspec-mocks (2.4.0)
|
31
|
+
sqlite3 (1.3.7)
|
32
|
+
sqlite3-ruby (1.3.3)
|
33
|
+
sqlite3 (>= 1.3.3)
|
34
|
+
yajl-ruby (1.1.0)
|
35
|
+
|
36
|
+
PLATFORMS
|
37
|
+
ruby
|
38
|
+
|
39
|
+
DEPENDENCIES
|
40
|
+
bundler (>= 1.0.0)
|
41
|
+
json
|
42
|
+
km-db!
|
43
|
+
progressbar
|
44
|
+
rake
|
45
|
+
rspec (~> 2.4.0)
|
46
|
+
sqlite3-ruby
|
data/README.markdown
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
The `km-db` gem should be useful to KissMetrics (KM) users.
|
2
|
+
Its aim is to efficiently process data obtained with KM's "Data Export" feature.
|
3
|
+
|
4
|
+
It is meant to :
|
5
|
+
|
6
|
+
* import KM event dumps into a SQL database (preferably MySQL / PostgreSQL)
|
7
|
+
* quickly process KM event dumps
|
8
|
+
|
9
|
+
Once imported, you can run complex queries against your visit history, for instance run multivariate analysis.
|
10
|
+
|
11
|
+
Beware though: KM data can be huge, and processing it is taxing!
|
12
|
+
|
13
|
+
|
14
|
+
Installing
|
15
|
+
----------
|
16
|
+
|
17
|
+
Add this to your Gemfile if you're using Bundler:
|
18
|
+
|
19
|
+
gem 'km-db', :git => 'git://github.com/HouseTrip/km-db.git'
|
20
|
+
|
21
|
+
|
22
|
+
Importing data
|
23
|
+
--------------
|
24
|
+
|
25
|
+
Running reports on raw logs can be less effective than running against a (relational) database.
|
26
|
+
`km-db` provides a `km_db_import` executable. Run it with:
|
27
|
+
|
28
|
+
$ bundle exec km_db_import <data-dump-directory>…
|
29
|
+
|
30
|
+
By default, your events will be imported into `test.db`, a SQLite database.
|
31
|
+
|
32
|
+
You can create `km_db.yml` or `config/km_db.yml` to have it import using another adapter, for instance:
|
33
|
+
|
34
|
+
---- km_db.yml ----
|
35
|
+
adapter: mysql2
|
36
|
+
database: km_events
|
37
|
+
user: root
|
38
|
+
|
39
|
+
Remember to add `sqlite3-ruby` or `mysql2` to your Gemfile.
|
40
|
+
|
41
|
+
|
42
|
+
Using imported data
|
43
|
+
-------------------
|
44
|
+
|
45
|
+
The `KMDB` module exposes four `ActiveRecord` classes:
|
46
|
+
`Event`, `Property`, `User` are the main domain objects.
|
47
|
+
`Key` is used to intern strings (event and property names) for performance.
|
48
|
+
|
49
|
+
### Finding events and properties
|
50
|
+
|
51
|
+
All visits during Jan. 2012:
|
52
|
+
|
53
|
+
KMDB::Event.before('2012-02-1').after('2012-01-01').named('visited site').by_date
|
54
|
+
|
55
|
+
All of a user's visits:
|
56
|
+
|
57
|
+
KMDB::User.last.events.named('visited site')
|
58
|
+
|
59
|
+
A user's referers:
|
60
|
+
|
61
|
+
KMDB::User.last.properties.named('referer').map(&:value)
|
62
|
+
|
63
|
+
Load some properties with events (uses a left join by default):
|
64
|
+
|
65
|
+
KMDB::User.last.events.with_properties('a prop', 'another prop').map(&:another_prop)
|
66
|
+
|
67
|
+
Note that many more complex queries will require building SQL queries directly.
|
68
|
+
|
69
|
+
|
70
|
+
Processing data
|
71
|
+
---------------
|
72
|
+
|
73
|
+
You don't have to import to filter your data.
|
74
|
+
|
75
|
+
The two classes you're looking for are `KMDB::Parser` and `KMDB::ParallelParser`.
|
76
|
+
The latter runs your filter task on all available CPUs, using the `parallel` gem.
|
77
|
+
|
78
|
+
The following example counts the number of *aliasing* events in all JSON files under `dumps/`:
|
79
|
+
|
80
|
+
require 'rubygems'
|
81
|
+
require 'kmdb'
|
82
|
+
|
83
|
+
counter = 0
|
84
|
+
parser = KMDB::Parser.new
|
85
|
+
parser.add_filter do |text,event|
|
86
|
+
counter += 1 if event['_p2']
|
87
|
+
end
|
88
|
+
parser.run('dumps/')
|
89
|
+
puts counter
|
90
|
+
|
91
|
+
Note that it will not work with `ParallelParser`, as the `counter` variable will be different for each process.
|
data/Rakefile
ADDED
data/bin/km_db_import
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
=begin
|
3
|
+
|
4
|
+
Import KM events from the raw dumps.
|
5
|
+
|
6
|
+
=end
|
7
|
+
|
8
|
+
require 'rubygems'
|
9
|
+
require 'kmdb'
|
10
|
+
|
11
|
+
# Choose the parser implementation from the class name of the active
# database connection: names matching "mysql" or "pgsql" get the forking
# ParallelParser, everything else gets the single-process Parser.
# NOTE(review): the PostgreSQL adapter class is presumably named
# "PostgreSQLAdapter", which would not match /pgsql/ -- confirm.
adapter_name = KMDB::Event.connection.class.to_s
parser_class = adapter_name =~ /(mysql|pgsql)/i ? KMDB::ParallelParser : KMDB::Parser

parser = parser_class.new(:resume => 'import', :verbose => true)

# Dispatch every decoded dump line to the model that stores it:
#   '_p2' present -> alias record linking two user names
#   '_n'  present -> named event
#   otherwise     -> bare property assignment
# The event hash is returned so that the filter chain keeps going.
parser.add_filter do |text, event|
  if event['_p2']
    KMDB::User.alias!(event['_p'], event['_p2'])
  elsif event['_n']
    KMDB::Event.record(event)
  else
    KMDB::Property.set(event)
  end

  event
end

# Heavy lifting happens here.
parser.run(ARGV)

# Post-processing. The connection is re-established first because it
# breaks when worker processes fork off.
KMDB::Event.connection.reconnect!
KMDB::User.resolve_alias_chains! # detect and flatten alias chains
KMDB::Key.fix_duplicates!        # remove key duplicates
|
data/km-db.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path("../lib/kmdb/version", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
  # --- identity -----------------------------------------------------------
  s.name        = "km-db"
  s.version     = KMDB::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["HouseTrip"]
  s.email       = ["jtl@housetrip.com"]
  s.homepage    = "https://github.com/housetrip/km-db"
  s.summary     = "Process KISSmetrics data dumps"
  s.description = "Process KISSmetrics data dumps"

  s.required_rubygems_version = ">= 1.3.6"

  # --- development-only dependencies ---------------------------------------
  s.add_development_dependency "bundler", ">= 1.0.0"
  s.add_development_dependency "rspec", "~> 2.4.0"
  s.add_development_dependency "rake"
  s.add_development_dependency "json"
  s.add_development_dependency "sqlite3-ruby"

  # --- runtime dependencies -------------------------------------------------
  s.add_dependency "yajl-ruby"
  s.add_dependency "progressbar"
  s.add_dependency "parallel"
  s.add_dependency "andand"
  s.add_dependency "activerecord", "~> 2.3.12"

  # --- packaged files (everything tracked by git) ---------------------------
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  s.test_files   = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are the tracked files under bin/, with the prefix stripped.
  s.executables  = tracked_files.map { |path| path =~ /^bin\/(.*)/ ? $1 : nil }.compact
  s.require_path = 'lib'
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module KMDB
  # Mixin for records that are owned by a KMDB::User.
  #
  # Including it declares a mandatory `user` association on the host class
  # and adds a `user_is` named scope restricting results to one user.
  module BelongsToUser
    def self.included(mod)
      mod.class_eval do
        belongs_to :user, :class_name => 'KMDB::User'
        validates_presence_of :user

        # user_is(user) -- records whose user_id is `user.id`.
        # Raises TypeError when the argument is not a KMDB::User.
        named_scope :user_is, lambda { |user|
          unless user.kind_of?(User)
            raise TypeError.new("Not a kind of User")
          end

          { :conditions => { :user_id => user.id } }
        }
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Base class for KM data.
|
4
|
+
Connect to a secondary database to store events, users, & properties.
|
5
|
+
|
6
|
+
FIXME: the database connection is hard-coded for now.
|
7
|
+
|
8
|
+
=end
|
9
|
+
|
10
|
+
require 'active_record'
|
11
|
+
require 'erb'
|
12
|
+
require 'yaml'
|
13
|
+
require 'kmdb/migration'
|
14
|
+
|
15
|
+
|
16
|
+
module KMDB
  # Abstract base class of all KMDB models. It owns the database connection
  # used for events, users and properties, and provides a few class-level
  # helpers shared by the concrete models.
  class CustomRecord < ActiveRecord::Base
    # Connection settings used when no configuration file is found; values
    # from a configuration file are merged over these.
    DefaultConfig = {
      'adapter'  => 'sqlite3',
      'database' => "test.db"
    }

    # Issues `ALTER TABLE ... DISABLE KEYS` for this model's table.
    # NOTE(review): this looks like MySQL-specific syntax -- confirm before
    # calling it on another adapter.
    def self.disable_index
      connection.execute %Q{
        ALTER TABLE `#{table_name}` DISABLE KEYS;
      }
    end

    # Issues `ALTER TABLE ... ENABLE KEYS` for this model's table
    # (counterpart of disable_index; same adapter caveat).
    def self.enable_index
      connection.execute %Q{
        ALTER TABLE `#{table_name}` ENABLE KEYS;
      }
    end

    # Returns the first record matching `options` (a conditions hash), or
    # creates one with those attributes when none exists.
    # Not atomic: two processes may both miss the lookup and both create.
    def self.find_or_create(options)
      find(:first, :conditions => options) || create(options)
    end

    # Establishes the connection and creates the schema on first use.
    #
    # Settings come from DefaultConfig, overridden by the first of
    # `km_db.yml` / `config/km_db.yml` that exists in the current directory.
    # The file is run through ERB before being parsed as YAML.
    def self.connect_to_km_db!
      config = DefaultConfig.dup

      config_path = ['km_db.yml', 'config/km_db.yml'].find { |path| File.exist?(path) }
      if config_path
        # File.read closes the file handle (File.open(...).read left it open).
        overrides = YAML.load(ERB.new(File.read(config_path)).result)
        # An empty file parses to nil/false; keep the defaults in that case.
        config.merge!(overrides) if overrides
      end

      establish_connection(config)

      # A missing `events` table means the schema has not been set up yet.
      unless connection.table_exists?('events')
        SetupEventsDatabase.up
        self.reset_column_information
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'kmdb/custom_record'
|
2
|
+
|
3
|
+
module KMDB
  # Bookkeeping record for one dump file within one import job: it stores
  # the byte offset up to which the file has already been processed, so an
  # interrupted run can resume from there.
  class Dumpfile < CustomRecord
    set_table_name "dumpfiles"

    validates_presence_of :offset
    validates_presence_of :path

    # Finds or creates the record for `pathname` (a Pathname) and `job`.
    # A missing job is stored as the literal string 'nil'.
    def self.get(pathname, job = nil)
      job_name = job || 'nil'
      find_or_create(:path => pathname.cleanpath.to_s, :job => job_name)
    end

    # Stored offset, defaulting to 0 when none has been saved yet.
    def offset
      attributes['offset'] || 0
    end

    # Persists a new offset; raises if the record cannot be saved.
    def set(offset)
      update_attributes!(:offset => offset)
    end
  end
end
|
data/lib/kmdb/event.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'kmdb/custom_record'
|
2
|
+
require 'kmdb/belongs_to_user'
|
3
|
+
require 'kmdb/has_properties'
|
4
|
+
|
5
|
+
module KMDB
  # A named event performed by a user at a point in time.
  #
  # Columns: `user_id`, `n` (id of the Key holding the event name) and
  # `t` (timestamp).
  class Event < CustomRecord
    include BelongsToUser
    include HasProperties

    set_table_name "events"

    # Events strictly before / after the given date.
    named_scope :before, lambda { |date| { :conditions => ["`#{table_name}`.`t` < ?", date] } }
    named_scope :after,  lambda { |date| { :conditions => ["`#{table_name}`.`t` > ?", date] } }

    # Events with the given name (resolved to its Key id).
    named_scope :named, lambda { |name| { :conditions => { :n => KMDB::Key.get(name) } } }

    # Oldest first.
    named_scope :by_date, lambda { { :order => "`#{table_name}`.`t` ASC" } }

    # Value of the first matching property attached to this event, or nil.
    def prop(name)
      properties.named(name).first.andand.value
    end

    # The event's name, i.e. the string interned under Key id `n`.
    # Reads the `string` column: the keys table has no `value` column, so
    # the previous `Key#value` call could not succeed.
    def name
      KMDB::Key.find(n).string
    end

    # Stores one event from a decoded dump line.
    #
    # hash - mutable Hash; '_p' (user name), '_t' (epoch seconds) and
    #        '_n' (event name) are removed from it, and whatever remains
    #        is stored as properties of the new event.
    #
    # Raises UserError when no user can be resolved for '_p'.
    def self.record(hash)
      user_name = hash.delete('_p')
      user = User.get(user_name)
      raise UserError.new("User missing for '#{user_name}'") unless user.present?

      stamp = Time.at(hash.delete('_t'))
      key   = Key.get(hash.delete('_n'))
      event = create(:t => stamp, :n => key, :user => user)
      Property.set(hash, stamp, user, event)
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
KMDB::HasProperties --
|
4
|
+
|
5
|
+
Trait shared by Event and User.
|
6
|
+
|
7
|
+
=end
|
8
|
+
|
9
|
+
module KMDB
  # Mixin for models that own properties.
  #
  # Including it declares `has_many :properties` on the host class and adds
  # the `with_properties` named scope, which loads selected property values
  # as extra columns of each returned record.
  module HasProperties
    def self.included(mod)
      mod.class_eval do
        has_many :properties, :class_name => 'KMDB::Property'

        # with_properties('a prop', 'other', ...) -- one join per property
        # name; each value is exposed under the name with whitespace
        # replaced by underscores ('a prop' -> a_prop).
        # Pass :exclude_missing among the arguments to use INNER joins
        # (records lacking a property are dropped) instead of LEFT joins.
        named_scope :with_properties, lambda { |*props|
          join_kind    = props.delete(:exclude_missing) ? 'INNER' : 'LEFT'
          prop_table   = Property.table_name
          columns      = ["`#{table_name}`.*"]
          join_clauses = []

          props.each_with_index do |prop, index|
            join_alias   = "#{prop_table}_#{index}"
            column_alias = prop.split.join('_')

            columns << "`#{join_alias}`.`value` AS `#{column_alias}`"

            join_template = %Q{
              #{join_kind} JOIN `properties` AS `#{join_alias}`
              ON `#{table_name}`.id = `#{join_alias}`.event_id
              AND `#{join_alias}`.`key` = ?}
            join_clauses << sanitize_sql_array([join_template, KMDB::Key.get(prop)])
          end

          { :select => columns.join(', '), :joins => join_clauses.join("\n") }
        }
      end
    end
  end
end
|
data/lib/kmdb/key.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Map strings (event and property names) to unique integers (Key#id) for performance
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'kmdb/custom_record'
|
8
|
+
|
9
|
+
module KMDB
  # Interned string: maps an event or property name to a small integer
  # (the record id) that events and properties store instead of the name.
  class Key < CustomRecord
    set_table_name "keys"

    has_many :events,     :foreign_key => :n,   :class_name => 'KMDB::Event',    :dependent => :delete_all
    has_many :properties, :foreign_key => :key, :class_name => 'KMDB::Property', :dependent => :delete_all

    # One row per string that is stored under more than one id.
    named_scope :has_duplicate, lambda {
      {
        :select => "id, string, COUNT(id) AS quantity",
        :group => :string, :having => "quantity > 1"
      }
    }

    # Returns the id for `string`, creating the key if needed.
    # Results are memoized per process in a class-level hash.
    def self.get(string)
      @cache ||= {}
      @cache[string] ||= get_uncached(string)
    end

    # Replace each duplicate key ID with its most-used variant.
    #
    # For every duplicated string, the id referenced by the most events and
    # properties is kept; rows pointing at the other ids are re-pointed to
    # it and the redundant keys are deleted.
    def self.fix_duplicates!
      has_duplicate.map(&:string).each do |string|
        all_keys = find(:all, :conditions => { :string => string })

        # Sort ids by usage, least used first. Usage is counted directly on
        # the foreign-key columns: the `named` scopes expect a key *string*
        # and would run the numeric id through Key.get, creating a new key
        # for it instead of counting anything.
        all_ids = all_keys.map { |key|
          usage = Event.count(:conditions => { :n => key.id }) +
                  Property.count(:conditions => { :key => key.id })
          [key.id, usage]
        }.sort_by { |pair| pair.last }.map { |pair| pair.first }

        id_to_keep = all_ids.pop
        $stderr.write "Fixing key '#{string}' #{all_ids.inspect} -> #{id_to_keep.inspect}\n"
        Event.update_all({ :n => id_to_keep }, ["`events`.`n` IN (?)", all_ids])
        Property.update_all({ :key => id_to_keep }, ["`properties`.`key` IN (?)", all_ids])
        Key.delete_all(["id IN (?)", all_ids])

        # The memoized id for this string may be one that was just deleted.
        @cache.delete(string) if @cache
      end
    end

    # Looks the string up in the database, creating it when absent.
    # Raises RuntimeError when the string exceeds MaxStringSize.
    #
    # Intended as an internal helper of `get`. It used to sit below a bare
    # `private`, which has no effect on `def self.` methods, so it has
    # always been publicly callable and is left that way.
    def self.get_uncached(string)
      string.size <= MaxStringSize or raise "String is too long"
      find_or_create(:string => string).id
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
Setup a custom database for KissMetrics tracking events.
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
module KMDB
  # Schema for the KM events database: events, keys, properties, users and
  # dumpfiles. Runs against CustomRecord's connection, not the default
  # ActiveRecord one.
  class SetupEventsDatabase < ActiveRecord::Migration
    # Every table created by `up`, in creation order.
    TABLES = [:events, :keys, :properties, :users, :dumpfiles]

    def self.connection
      CustomRecord.connection
    end

    # Creates all tables and their indexes.
    def self.up
      create_table :events do |t|
        t.integer  :user_id
        t.integer  :n          # Key id of the event name
        t.datetime :t          # time of the event
      end
      add_index :events, [:n]
      add_index :events, [:user_id]

      create_table :keys do |t|
        t.string :string, :limit => MaxStringSize
      end
      add_index :keys, [:string]

      create_table :properties do |t|
        t.integer  :user_id
        t.integer  :event_id
        t.integer  :key        # Key id of the property name
        t.string   :value, :limit => 64
        t.datetime :t
      end
      add_index :properties, [:key]
      add_index :properties, [:user_id]
      add_index :properties, [:event_id]

      create_table :users do |t|
        t.string  :name, :limit => 48
        t.integer :alias_id    # id of the user this one is an alias of
      end
      add_index :users, [:name]

      create_table :dumpfiles do |t|
        t.string  :path
        t.string  :job
        t.integer :offset
      end
      add_index :dumpfiles, [:path]
    end

    # Drops every table created by `up`.
    # (Previously this tried to drop a never-created `aliases` table and
    # left `keys` and `dumpfiles` behind.)
    def self.down
      TABLES.each { |table| drop_table table }
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'kmdb/parser'
|
2
|
+
require 'parallel'
|
3
|
+
|
4
|
+
module KMDB
  # Parser variant that processes the input files in several worker
  # processes (via the `parallel` gem) while the calling process gathers
  # progress reports from them over a pipe.
  class ParallelParser < Parser

    # options - same as Parser#initialize, plus:
    #           :workers - number of worker processes
    #                      (default: Parallel.processor_count)
    def initialize(options = {})
      super(options)
      @worker_count = options.delete(:workers) || Parallel.processor_count
    end

    # Processes every file designated by `argv` (see Parser#list_files_in).
    #
    # A forked child runs the workers; this process turns into the
    # "gatherer", reading progress lines from the pipe until all writers
    # have closed it, then waits for the children to exit.
    def run(argv)
      @pipe_rd, @pipe_wr = IO.pipe

      inputs = list_files_in(argv)
      total_bytes = total_size_of_files(inputs)
      log "total bytes : #{total_bytes}"
      # Bytes already handled by a previous run of the same job don't count.
      total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
      log "left to process : #{total_bytes}"

      # Start workers
      log "Using #{@worker_count} workers."
      Process.fork do
        @pipe_rd.close
        Parallel.each(inputs, :in_processes => @worker_count) do |input|
          # A forked process must not reuse its parent's database connection.
          KMDB::Event.connection.reconnect!
          log "Worker #{Process.pid} starting #{input}"
          $0 = "worker: #{input}"
          process_events_in_file(input)
          log "Worker #{Process.pid} done"
          true
        end
      end

      # Start gatherer
      $0 = "gatherer: #{$0}"
      # Close our copy of the write end so the read loop sees EOF once
      # every worker is done.
      @pipe_wr.close
      byte_counter = 0
      log "Starting gatherer, total bytes: #{total_bytes}"
      progress = ProgressBar.new("-" * 20, total_bytes)
      while line = @pipe_rd.gets
        if line =~ /^OK (\d+)$/
          byte_counter += $1.to_i
          progress.set byte_counter
        elsif line =~ /^FILE (.*)$/
          progress.title = $1
        else
          log "Unparsed line: '#{line}'"
        end
      end
      progress.finish
      log "Total bytes processed: #{byte_counter}"
      Process.waitall
    end

    private

    # Worker side: processes one file and reports progress on the pipe.
    #
    # Protocol (one message per line):
    #   "FILE <basename>" - the worker started on a file
    #   "OK <bytes>"      - that many more bytes have been processed
    def process_events_in_file(pathname)
      pathname.open do |input|
        processed_bytes = 0
        dumpfile = nil
        if @resume_job
          dumpfile = Dumpfile.get(pathname, @resume_job)
          log "Starting file #{pathname} from offset #{dumpfile.offset}"
          input.seek(dumpfile.offset)
        end

        @pipe_wr.write "FILE #{pathname.basename}\n"
        while line = input.gets
          processed_bytes += line.size

          process_event(line)
          # There is no dumpfile record without a resume job; calling
          # `set` unconditionally failed on nil in that case.
          dumpfile.set(input.tell) if dumpfile

          # Report in batches to keep pipe traffic low.
          if processed_bytes > 100_000
            @pipe_wr.write "OK #{processed_bytes}\n"
            processed_bytes = 0
          end
        end
        # Flush whatever is left of the last batch.
        @pipe_wr.write "OK #{processed_bytes}\n"
      end
    end

  end
end
|
data/lib/kmdb/parser.rb
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
require 'yajl/json_gem'
|
2
|
+
require 'pathname'
|
3
|
+
require 'progressbar'
|
4
|
+
require 'pstore'
|
5
|
+
|
6
|
+
module KMDB
  # Reads KM dump files line by line, decodes each line as JSON and passes
  # the result through a chain of caller-supplied filter blocks.
  class Parser
    # Progress bar whose title can be changed while it is running.
    class ProgressBar < ::ProgressBar
      attr_writer :title
    end

    attr_reader :resume_job, :verbose, :abort_on_error

    # options - Hash; recognised keys are removed from it:
    #           :verbose        - write log messages to $stderr
    #           :resume         - job name; when set, per-file offsets are
    #                             saved so that a later run can resume
    #           :abort_on_error - re-raise decoding errors instead of
    #                             skipping the offending line
    def initialize(options = {})
      @processed_bytes = nil
      @total_bytes     = nil
      @exclude_regexps = []
      @include_regexps = []
      @filters         = []
      @verbose         = options.delete(:verbose)
      @resume_job      = options.delete(:resume)
      @abort_on_error  = options.delete(:abort_on_error)

      if @resume_job && @verbose && Dumpfile.count > 0
        log "Using restart information"
      end
    end

    # Skip raw lines matching `regexp`. Returns self for chaining.
    def exclude(regexp)
      @exclude_regexps << regexp
      self
    end

    # Only process raw lines matching `regexp` (all such regexps must
    # match). Returns self for chaining.
    def only(regexp)
      @include_regexps << regexp
      self
    end

    # Appends a filter block, called as block.call(text, data) where `text`
    # is the raw line and `data` is the decoded event (or whatever the
    # previous filter returned). A falsy return value stops the chain for
    # that line. Returns self for chaining.
    def add_filter(&block)
      @filters << block
      self
    end

    # Processes every file designated by `argv`: a path, or a list of
    # paths, each being a file or a directory searched recursively for
    # *.json files.
    def run(argv)
      inputs = list_files_in(argv)
      total_bytes = total_size_of_files(inputs)
      log "total bytes : #{total_bytes}"
      # Bytes already handled by a previous run of the same job don't count.
      total_bytes -= inputs.map { |p| Dumpfile.get(p, @resume_job) }.compact.map(&:offset).sum
      log "left to process : #{total_bytes}"

      @processed_bytes = 0
      @progress = ProgressBar.new("-" * 20, total_bytes)
      @progress.long_running if @progress.respond_to?(:long_running)

      inputs.sort.each do |input|
        process_events_in_file(input)
      end

      @progress.finish
    end

    private

    # Writes `message` to $stderr in verbose mode.
    def log(message)
      $stderr.write(message + "\n") if @verbose
    end

    # Decodes one raw line and runs it through the filter chain.
    # Lines rejected by exclude/only, and lines that cannot be decoded,
    # are skipped (or raise, with :abort_on_error).
    def process_event(text)
      return if @exclude_regexps.any? { |re| text =~ re }
      return unless @include_regexps.all? { |re| text =~ re }

      # filter strange utf-8 encoding/escaping found in KM dumps
      if text =~ /\\30[3-5]\\[0-9]{3}/
        begin
          # SECURITY: this evaluates the dump line as a Ruby string
          # literal, so "#{...}" sequences inside the line are executed as
          # code. Only safe on trusted dumps; consider replacing it with an
          # explicit octal-escape decoder.
          text = eval("%Q(#{text})")
        rescue SyntaxError => e
          log "Syntax error in: #{text}"
          raise e if @abort_on_error
        end
      end

      begin
        data = JSON.parse(text)
      rescue JSON::ParserError => e
        log "Warning, JSON parse error in: #{text}"
        raise e if @abort_on_error
        return
      end

      if data.nil?
        log "Warning, JSON parse failed in: #{text}"
        return
      end

      @filters.each do |filter|
        data = filter.call(text, data) or break
      end
    end

    # Processes one file, resuming from and updating its saved offset when
    # a resume job is set.
    def process_events_in_file(pathname)
      pathname.open do |input|
        @progress.title = pathname.basename.to_s
        if @resume_job
          dumpfile = Dumpfile.get(pathname, @resume_job)
          log "Starting file #{pathname} from offset #{dumpfile.offset}"
          input.seek(dumpfile.offset)
        end
        line_number = 0
        while line = input.gets
          @processed_bytes += line.size
          # Refresh the progress bar every 100 lines only.
          @progress.set @processed_bytes if line_number % 100 == 0
          line_number += 1

          process_event(line)
          dumpfile.set(input.tell) if @resume_job
        end
      end
    end

    # Sum of the sizes, in bytes, of the given Pathnames.
    def total_size_of_files(inputs)
      inputs.inject(0) { |total, input| total + input.stat.size }
    end

    # Sorted list of all *.json paths found under `directory` (a Pathname).
    def list_files_in_directory(directory)
      json_files = []
      directory.find do |candidate|
        json_files << candidate if candidate.to_s =~ /\.json$/
      end
      json_files.sort
    end

    # Expands `argv` into a flat list of Pathnames: files are kept as they
    # are, directories are replaced by the *.json files they contain.
    # `argv` may be a list of paths or a single path (as in
    # `parser.run('dumps/')`); Kernel#Array normalises both.
    #
    # Raises RuntimeError when a path does not exist.
    def list_files_in(argv)
      paths = Array(argv).map { |arg| Pathname.new(arg) }
      paths.each do |pn|
        raise "No such file or directory '#{pn}'" unless pn.exist?
      end
      paths.map { |pn|
        pn.directory? ? list_files_in_directory(pn) : pn
      }.flatten
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'kmdb/belongs_to_user'
|
2
|
+
|
3
|
+
module KMDB
  # A key/value pair set for a user at a point in time, optionally as part
  # of an event. The key is stored as a Key id.
  class Property < CustomRecord
    include BelongsToUser

    set_table_name "properties"
    belongs_to :event, :class_name => 'KMDB::Event'

    # Most recent first, unless another order is requested.
    default_scope :order => 't DESC'

    # Properties with the given name (resolved to its Key id).
    named_scope :named, lambda { |name| { :conditions => { :key => KMDB::Key.get(name) } } }

    # Stores every remaining entry of `hash` as a property row, using one
    # multi-row INSERT.
    #
    # hash  - mutable Hash; '_p' (user name) and '_t' (epoch seconds) are
    #         removed from it, the rest are property name/value pairs
    # stamp - timestamp used when the hash carries no '_t'
    # user  - owner; looked up from '_p' when not given
    # event - event the properties belong to, if any
    #
    # Raises UserError when no user can be resolved.
    # Returns nil without touching the database when no properties remain.
    def self.set(hash, stamp=nil, user=nil, event=nil)
      user_name = hash.delete('_p')
      user ||= User.get(user_name)
      raise UserError.new("User missing for '#{user_name}'") unless user.present?

      event_id = nil
      event_id = event.id if event

      stamp = Time.at(hash.delete('_t') || stamp)

      return if hash.empty?

      row_tuples = hash.map do |prop_name, value|
        key = Key.get(prop_name)
        sanitize_sql_array(["(?,?,?,?,?)", stamp, user.id, event_id, key, value])
      end

      statement = "INSERT INTO `#{table_name}` (`t`,`user_id`,`event_id`,`key`,`value`) VALUES "
      connection.execute(statement + row_tuples.join(","))
    end
  end
end
|
data/lib/kmdb/user.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'kmdb/has_properties'
|
2
|
+
|
3
|
+
module KMDB
  # A KM identity. Several names can designate the same person: an alias
  # user points to the user it stands for through `alias_id`.
  class User < CustomRecord
    include HasProperties

    set_table_name "users"

    has_many :events, :class_name => 'KMDB::Event'

    # points to the aliased user. if set, no properties/events should belong to this user
    belongs_to :alias, :class_name => 'KMDB::User'

    validates_presence_of   :name
    validates_uniqueness_of :name

    named_scope :named, lambda { |name| { :conditions => { :name => name } } }

    # One row per name that is stored more than once. `name` is selected
    # because fix_duplicates! reads it from the returned rows.
    named_scope :duplicates, lambda {{
      :select => "id, name, COUNT(id) AS quantity", :group => :name, :having => "quantity > 1"
    }}

    # return (latest) value of property
    def prop(name)
      properties.named(name).first.andand.value
    end

    # mark this user as aliasing another: hand all of this user's
    # properties and events over to `other`, then record the alias link.
    def aliases!(other)
      [Property, Event].each do |model|
        model.user_is(self).update_all({ :user_id => other.id })
      end
      self.update_attributes!(:alias => other)
    end

    # return the user named `name` (creating it if necessary)
    # if `name` is an alias, return the original user
    def self.get(name)
      user = named(name).first || create(:name => name)
      user = user.alias while user.alias
      return user
    end

    # mark the two names as pointing to the same user
    # (the user behind `name2` becomes an alias of the one behind `name1`)
    def self.alias!(name1, name2)
      u1 = get(name1)
      u2 = get(name2)

      # `get` already follows alias links, so neither user should still
      # have one. (These warnings used to reference an undefined `user`
      # local and would have raised NameError if ever triggered.)
      [u1, u2].each do |user|
        $stderr.write "Warning: user '#{user.name}' has an alias\n" if user.alias
      end

      # nothing to do if both names already point to the same user
      return if u1 == u2

      u2.aliases! u1
    end

    # duplication can occur during parallel imports because we're not running transactionally.
    # For each duplicated name, the last matching record is kept; the
    # others hand their data over to it and are destroyed.
    def self.fix_duplicates!
      duplicates.map(&:name).each do |name|
        all_users = named(name).all
        kept_user = all_users.pop
        all_users.each do |user|
          user.aliases! kept_user
          user.destroy
        end
      end
    end

    # detect alias chains (an alias pointing to a user that is itself an
    # alias) and re-point each such user directly at the end of its chain.
    def self.resolve_alias_chains!
      find(:all, :joins => :alias, :conditions => 'aliases_users.alias_id IS NOT NULL').each do |user|
        user   = find(user.id) # reload: the joined row may be stale by now
        origin = find(user.alias_id)
        origin = origin.alias while origin.alias # go up the chain
        $stderr.write "Aliasing #{user.name} -> #{origin.name}\n"
        user.aliases!(origin)
      end
    end
  end
end
|
data/lib/kmdb/version.rb
ADDED
data/lib/kmdb.rb
ADDED
metadata
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: km-db
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- HouseTrip
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2013-03-23 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
23
|
+
none: false
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
hash: 23
|
28
|
+
segments:
|
29
|
+
- 1
|
30
|
+
- 0
|
31
|
+
- 0
|
32
|
+
version: 1.0.0
|
33
|
+
prerelease: false
|
34
|
+
name: bundler
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 31
|
44
|
+
segments:
|
45
|
+
- 2
|
46
|
+
- 4
|
47
|
+
- 0
|
48
|
+
version: 2.4.0
|
49
|
+
prerelease: false
|
50
|
+
name: rspec
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
hash: 3
|
60
|
+
segments:
|
61
|
+
- 0
|
62
|
+
version: "0"
|
63
|
+
prerelease: false
|
64
|
+
name: rake
|
65
|
+
type: :development
|
66
|
+
version_requirements: *id003
|
67
|
+
- !ruby/object:Gem::Dependency
|
68
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
69
|
+
none: false
|
70
|
+
requirements:
|
71
|
+
- - ">="
|
72
|
+
- !ruby/object:Gem::Version
|
73
|
+
hash: 3
|
74
|
+
segments:
|
75
|
+
- 0
|
76
|
+
version: "0"
|
77
|
+
prerelease: false
|
78
|
+
name: json
|
79
|
+
type: :development
|
80
|
+
version_requirements: *id004
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ">="
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
hash: 3
|
88
|
+
segments:
|
89
|
+
- 0
|
90
|
+
version: "0"
|
91
|
+
prerelease: false
|
92
|
+
name: sqlite3-ruby
|
93
|
+
type: :development
|
94
|
+
version_requirements: *id005
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
requirement: &id006 !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ">="
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
hash: 3
|
102
|
+
segments:
|
103
|
+
- 0
|
104
|
+
version: "0"
|
105
|
+
prerelease: false
|
106
|
+
name: yajl-ruby
|
107
|
+
type: :runtime
|
108
|
+
version_requirements: *id006
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
requirement: &id007 !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
118
|
+
version: "0"
|
119
|
+
prerelease: false
|
120
|
+
name: progressbar
|
121
|
+
type: :runtime
|
122
|
+
version_requirements: *id007
|
123
|
+
- !ruby/object:Gem::Dependency
|
124
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
125
|
+
none: false
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
hash: 3
|
130
|
+
segments:
|
131
|
+
- 0
|
132
|
+
version: "0"
|
133
|
+
prerelease: false
|
134
|
+
name: parallel
|
135
|
+
type: :runtime
|
136
|
+
version_requirements: *id008
|
137
|
+
- !ruby/object:Gem::Dependency
|
138
|
+
requirement: &id009 !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
hash: 3
|
144
|
+
segments:
|
145
|
+
- 0
|
146
|
+
version: "0"
|
147
|
+
prerelease: false
|
148
|
+
name: andand
|
149
|
+
type: :runtime
|
150
|
+
version_requirements: *id009
|
151
|
+
- !ruby/object:Gem::Dependency
|
152
|
+
requirement: &id010 !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ~>
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
hash: 27
|
158
|
+
segments:
|
159
|
+
- 2
|
160
|
+
- 3
|
161
|
+
- 12
|
162
|
+
version: 2.3.12
|
163
|
+
prerelease: false
|
164
|
+
name: activerecord
|
165
|
+
type: :runtime
|
166
|
+
version_requirements: *id010
|
167
|
+
description: Process KISSmetrics data dumps
|
168
|
+
email:
|
169
|
+
- jtl@housetrip.com
|
170
|
+
executables:
|
171
|
+
- km_db_import
|
172
|
+
extensions: []
|
173
|
+
|
174
|
+
extra_rdoc_files: []
|
175
|
+
|
176
|
+
files:
|
177
|
+
- Gemfile
|
178
|
+
- Gemfile.lock
|
179
|
+
- README.markdown
|
180
|
+
- Rakefile
|
181
|
+
- bin/km_db_import
|
182
|
+
- km-db.gemspec
|
183
|
+
- lib/kmdb.rb
|
184
|
+
- lib/kmdb/belongs_to_user.rb
|
185
|
+
- lib/kmdb/custom_record.rb
|
186
|
+
- lib/kmdb/dumpfile.rb
|
187
|
+
- lib/kmdb/event.rb
|
188
|
+
- lib/kmdb/has_properties.rb
|
189
|
+
- lib/kmdb/key.rb
|
190
|
+
- lib/kmdb/migration.rb
|
191
|
+
- lib/kmdb/parallel_parser.rb
|
192
|
+
- lib/kmdb/parser.rb
|
193
|
+
- lib/kmdb/property.rb
|
194
|
+
- lib/kmdb/user.rb
|
195
|
+
- lib/kmdb/user_error.rb
|
196
|
+
- lib/kmdb/version.rb
|
197
|
+
has_rdoc: true
|
198
|
+
homepage: https://github.com/housetrip/km-db
|
199
|
+
licenses: []
|
200
|
+
|
201
|
+
post_install_message:
|
202
|
+
rdoc_options: []
|
203
|
+
|
204
|
+
require_paths:
|
205
|
+
- lib
|
206
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
207
|
+
none: false
|
208
|
+
requirements:
|
209
|
+
- - ">="
|
210
|
+
- !ruby/object:Gem::Version
|
211
|
+
hash: 3
|
212
|
+
segments:
|
213
|
+
- 0
|
214
|
+
version: "0"
|
215
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
216
|
+
none: false
|
217
|
+
requirements:
|
218
|
+
- - ">="
|
219
|
+
- !ruby/object:Gem::Version
|
220
|
+
hash: 23
|
221
|
+
segments:
|
222
|
+
- 1
|
223
|
+
- 3
|
224
|
+
- 6
|
225
|
+
version: 1.3.6
|
226
|
+
requirements: []
|
227
|
+
|
228
|
+
rubyforge_project:
|
229
|
+
rubygems_version: 1.3.9.5
|
230
|
+
signing_key:
|
231
|
+
specification_version: 3
|
232
|
+
summary: Process KISSmetrics data dumps
|
233
|
+
test_files: []
|
234
|
+
|