nfcollector 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +3 -0
- data/.rvmrc +1 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/lib/nfcollector.rb +41 -0
- data/lib/nfcollector/attribute_validator.rb +59 -0
- data/lib/nfcollector/attributes.rb +99 -0
- data/lib/nfcollector/categoriser.rb +43 -0
- data/lib/nfcollector/category_partition.rb +17 -0
- data/lib/nfcollector/configuration.rb +24 -0
- data/lib/nfcollector/copy_file_writer.rb +47 -0
- data/lib/nfcollector/domain_parser.rb +49 -0
- data/lib/nfcollector/input_definition.rb +31 -0
- data/lib/nfcollector/mapping.rb +7 -0
- data/lib/nfcollector/mapping/categories_processor.rb +36 -0
- data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
- data/lib/nfcollector/mapping/default_output.rb +45 -0
- data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
- data/lib/nfcollector/mapping/indexer.rb +21 -0
- data/lib/nfcollector/mapping/mapped_row.rb +21 -0
- data/lib/nfcollector/mapping/output.rb +59 -0
- data/lib/nfcollector/mapping/transpiler.rb +92 -0
- data/lib/nfcollector/nfcollector_exception.rb +4 -0
- data/lib/nfcollector/partition.rb +76 -0
- data/lib/nfcollector/partitioner.rb +37 -0
- data/lib/nfcollector/payload_processor.rb +46 -0
- data/lib/nfcollector/sequence_generator.rb +11 -0
- data/lib/nfcollector/version.rb +3 -0
- data/lib/nfcollector/weblog_partition.rb +26 -0
- data/nfcollector.gemspec +30 -0
- data/spec/attribute_validator_spec.rb +23 -0
- data/spec/attributes_spec.rb +15 -0
- data/spec/command_parser_spec.rb +81 -0
- data/spec/copy_file_writer_spec.rb +95 -0
- data/spec/input_definition_spec.rb +18 -0
- data/spec/nfcollector/category_partitioner_spec.rb +51 -0
- data/spec/nfcollector/date_partitioner_spec.rb +19 -0
- data/spec/nfcollector/input_definition_spec.rb +32 -0
- data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
- data/spec/nfcollector/mapping/output_spec.rb +76 -0
- data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
- data/spec/payload_job_spec.rb +11 -0
- data/spec/payload_processor_spec.rb +114 -0
- data/spec/spec_helper.rb +89 -0
- data/test/domains_hosts +194826 -0
- data/test/generate_input.rb +79 -0
- data/test/input/input-1000.csv +1000 -0
- data/test/input/input-100000.csv +100000 -0
- data/test/input/input-100000.dat +64039 -0
- data/test/input/input-no-tags.csv +3 -0
- data/test/input/input-no-tags.dat +3 -0
- data/test/input/input-no-tags.gz +0 -0
- data/test/input/input-with-tags.csv.gz +0 -0
- data/test/test_helper.rb +15 -0
- data/test/tester.rb +32 -0
- metadata +252 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # Collects the distinct usernames seen while rows are transpiled and
    # pushes them to the configured user committer in a single batch.
    class Indexer
      # @param [Integer,String] account_id the account the usernames belong to
      def initialize(account_id)
        @account_id = account_id
        # Hash used as a set: lookups/inserts are faster than Set
        # (plus we can store other things against each user later).
        @users = {}
      end

      # AfterRowHook-compatible entry point.
      # @param [MappedRow] mapped_row the transpiled row
      # @param [Array<Integer>] indicies positions of the columns we need;
      #   the first one is the username column
      def perform(mapped_row, indicies)
        username = mapped_row.values_at(*indicies).first
        @users[username] = nil
      end

      # Flushes the collected usernames via the configured committer.
      def commit!
        Configuration.user_committer.commit(@account_id, @users)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # A transpiled output row. Behaves like an Array of column values but
    # additionally carries the row's primary key (+id+) and category ids.
    class MappedRow < Array
      attr_writer :category_ids
      attr_accessor :id

      # Accepts the column values directly: MappedRow.new(1, "a", nil)
      def initialize(*args)
        super()
        concat(args)
      end

      # @return [Array] the category ids assigned by the categories
      #   processor, or an empty array when none were set
      def category_ids
        @category_ids || []
      end

      # Prefixes the array representation with the id when one is set,
      # e.g. "7:[1, 2]"; plain array form otherwise.
      def inspect
        [@id, super].compact.join(":")
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # Declares an output column of the transpilation. Class-level DSL
    # methods (+output+, +categories+) accumulate column definitions which
    # +build_transpiler+ turns into a fully configured Transpiler.
    #
    # TODO: Consider merging this and DefaultOutput
    class Output
      class << self
        attr_accessor :categories_processor

        # @return [Array<Output>] all columns declared so far
        def outputs
          # FIX: was memoized into @output (singular) while the reader was
          # named `outputs` — a naming inconsistency that hid the state from
          # anything expecting @outputs.
          @outputs ||= []
        end

        # DSL: declare an output column.
        # @param [Symbol,String] name the output column name
        # @param [Hash] options see #initialize
        def output(name, options = {})
          outputs << new(name, options)
        end

        # DSL: declare how the category column is processed.
        def categories(options = {})
          self.categories_processor = CategoriesProcessor.new(options)
        end

        # Builds a Transpiler wired up with every declared column and the
        # categories processor.
        # @param [InputDefinition] input_definition describes the input columns
        # @param [Hash] column_options passed through to each ColumnTranspiler
        def build_transpiler(input_definition, column_options = {})
          Transpiler.new(input_definition, column_options).tap do |transpiler|
            outputs.each { |output| transpiler.add_column(output) }
            transpiler.process_categories_with(categories_processor)
          end
        end
      end

      attr_reader :name
      attr_reader :inputs
      attr_reader :process_with

      # @param [Symbol,String] name the output column name
      # @param [Hash] options
      # @option options [Symbol,Array<Symbol>] :inputs input column(s) this
      #   output is derived from (defaults to the output name)
      # @option options [Symbol] :process_with name of the processing method
      #   (defaults to the output name)
      # @option options [Boolean] :primary_key marks this column as the row id
      # @option options [Boolean] :optional column may be absent from input
      def initialize(name, options = {})
        @name = name.to_sym
        @inputs = Array(options.fetch(:inputs, @name))
        @process_with = options.fetch(:process_with, @name)
        @pkey = options[:primary_key]
        @optional = options[:optional]
      end

      def optional?
        @optional == true
      end

      def required?
        !optional?
      end

      def primary_key?
        @pkey == true
      end

      # Unknown processing methods act as a pass-through: the raw value is
      # returned untouched. NOTE(review): consider pairing this with a
      # respond_to_missing? definition if respond_to? checks matter here.
      def method_missing(method, arg, options = {})
        arg
      end
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  module Mapping
    # Represents a hook that is called after each row has been transpiled.
    class AfterRowHook
      # @param [Object,Class] processor an object or class that responds to
      #   perform(mapped_row, indicies)
      # @param [Array<Integer>] indicies positions in the transpiled row
      #   from which the processing arguments are taken
      def initialize(processor, indicies)
        @processor = processor
        @indicies = indicies
      end

      def perform(mapped_row)
        @processor.perform(mapped_row, @indicies)
      end
    end

    # Turns raw input rows into MappedRows according to the configured
    # output columns, running category processing and after-row hooks.
    class Transpiler
      attr_reader :outputs

      def initialize(input_definition, column_options = {})
        @input_definition = input_definition
        @column_options = column_options
        @outputs = []
        @after_row_hooks = []
      end

      # @return [Array<Symbol>] the output column names, in order
      def headers
        @outputs.map(&:name)
      end

      # Registers a column for transpilation when the input provides it, or
      # the column needs no input, or the column is required regardless.
      # TODO: Make this output.should_transpile?(@input_definition)
      def add_column(output)
        return unless output.inputs.blank? ||
                      @input_definition.has_index_for?(output.inputs) ||
                      output.required?

        column_transpiler = ColumnTranspiler.new(output, @column_options)
        column_transpiler.build!(@input_definition)
        @outputs << column_transpiler
        @primary_key_index = @outputs.size - 1 if output.primary_key?
      end

      # Wires up the categories processor; the category input index is only
      # resolved when the input carries it or the processor demands it.
      def process_categories_with(categories_processor)
        @categories_processor = categories_processor
        if @input_definition.has_index_for?(:category) || @categories_processor.required?
          @category_index = @input_definition.column_index(:category)
        end
      end

      # Registers a processor to run after each row, fed the values found
      # at the given output columns (options[:using]).
      def after_row(processor, options = {})
        indicies = Array(options[:using]).map { |column| index_of(column) }
        @after_row_hooks << AfterRowHook.new(processor, indicies)
      end

      # @param [Symbol] column name of the column
      # @return [Integer,nil] the index of the column name in the output
      def index_of(column)
        @outputs.index { |output| output.name == column }
      end

      # Maps one raw input row into a MappedRow, assigning its category ids
      # and primary key, then fires the after-row hooks.
      def transpile(row)
        mapped = MappedRow.new
        @outputs.each { |column| mapped << column.go(row) }
        mapped.category_ids = process_categories(row)
        mapped.id = primary_key(mapped)
        process_after_row_hooks(mapped)
        mapped
      end

      def process_after_row_hooks(mapped_row)
        @after_row_hooks.each { |hook| hook.perform(mapped_row) }
      end

      # @return the processed category ids, or nil when no processor or no
      #   category column is configured
      def process_categories(row)
        return unless @categories_processor.present? && @category_index.present?

        @categories_processor.perform(row[@category_index])
      end

      # @return the primary-key value from the output row, or nil when no
      #   primary-key column was registered
      def primary_key(output_row)
        return nil if @primary_key_index.blank?

        output_row[@primary_key_index]
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  # Base class for a single partition of output rows (for example, one
  # day's worth of weblogs). Subclasses must implement
  # .partition_id(value) and #keys.
  class Partition
    attr_reader :rows
    attr_reader :headers
    attr_reader :account_id
    # NOTE: the original also declared `attr_reader :partition_id`, which
    # was dead code — it was unconditionally shadowed by the #partition_id
    # method below — so it has been removed.

    # Instantiates a partitioner that will generate partitions
    # of this type
    #
    # @param [Integer,String] account_id the account_id for which we are processing data
    # @param [Integer] partition_key_index the index in each row of the partition key
    # @param [Array] headers an array of headers for writing the data to file
    def self.get_partitioner(account_id, partition_key_index, headers)
      Partitioner.new(self, account_id, partition_key_index).tap do |part|
        part.set_headers!(headers)
      end
    end

    # The ActiveRecord class used to back this partition type
    class_attribute :ar_klass

    # Create a new partition holding rows whose partition key maps to
    # +value+.
    # @param value the partition key value (e.g. a timestamp)
    # @param [Integer,String] account_id
    # @param [Array] headers column headers for the copy file
    def initialize(value, account_id, headers)
      @value = value
      @account_id = account_id
      @headers = headers
      @rows = []
    end

    def partition_id
      self.class.partition_id(@value)
    end

    # Adds a row to the partition, prefixing the account id column.
    # @param [MappedRow] row
    def add_row(row)
      # TODO: Is there a cleaner way to do this??
      @rows << [@account_id] + row
    end

    # Return the table name that should be used for the data
    # in this partition, creating the backing partition table if necessary.
    def table_name
      return @table_name if @table_name
      partition = ar_klass.partitions.find_or_create_for(keys)
      @table_name = partition.nil? ? ar_klass.table_name : partition.name
    end

    # Unique file name the partition's copy data is written to.
    def file_name
      @file_name ||= File.expand_path("#{table_name}_#{randstr}.copy", Configuration.output_dir)
    end

    # @return [Hash] a set of keys used to generate the table for this partition
    # @raise [NotImplementedError] subclasses must implement this
    def keys
      # FIX: was `raise NotImplemented` — no such constant exists in Ruby,
      # so calling this would have raised NameError instead of the
      # intended NotImplementedError.
      raise NotImplementedError
    end

    private

    # Eight random lowercase letters for uniquifying file names.
    # NOTE(review): (rand * 25) only ever yields 'a'..'y'; use 26 if 'z'
    # should also be possible — left as-is to preserve behavior.
    def randstr
      "".tap do |str|
        8.times do
          str << ((rand * 25).to_i + 97).chr
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Nfcollector
  # Routes mapped rows into Partition instances keyed by their partition
  # id, creating partitions lazily, and commits each one to a copy file.
  class Partitioner
    attr_reader :column_indexes
    attr_reader :data

    # TODO: use 2.1 named parameters when we can guarantee Ruby 2.1
    # @param [Class] partition_klass the Partition subclass to instantiate
    # @param [Integer,String] account_id
    # @param [Integer] column_index index of the partition key in each row
    def initialize(partition_klass, account_id, column_index)
      @partition_klass = partition_klass
      @account_id = account_id
      @column_index = column_index
      @data = {}
    end

    def set_headers!(headers)
      @headers = headers
    end

    # Adds the row to the partition its key belongs to, creating that
    # partition on first sight.
    def add_row(row)
      value = row_value(row)
      id = @partition_klass.partition_id(value)
      partition = (@data[id] ||= @partition_klass.new(value, @account_id, @headers))
      partition.add_row(row)
    end

    # @return the partition key value for the given row
    def row_value(row)
      row[@column_index]
    end

    # Writes every partition out via CopyFileWriter.
    # NOTE(review): account_id is currently unused here; the parameter is
    # kept for interface compatibility with existing callers.
    def commit!(account_id)
      @data.values.map do |partition|
        CopyFileWriter.open(partition) do |copy|
          copy.commit!
        end
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Nfcollector
  # Orchestrates processing of one gzipped CSV payload for an account:
  # builds a transpiler from the declared attributes, maps every row into
  # partitions, and commits the output plus the category and user indexes.
  class PayloadProcessor
    # @param [Integer,String] account_id
    # @param tz the timezone applied to timestamp columns
    # @param [String] attributes_string the payload's attribute declaration
    def initialize(account_id, tz, attributes_string)
      @account_id = account_id
      @tz = tz
      input_definition = Attributes.parse(attributes_string)
      # TODO: Will need to specify the Output class to use later
      @transpiler = Mapping::DefaultOutput.build_transpiler(input_definition, tz: @tz)
      @categoriser = Categoriser.new(account_id)
      @indexer = Mapping::Indexer.new(account_id)
      # TODO: Do the category lookups inside the categoriser and pass it the categories string
      # (Only do this if the input definition has the categories - actually ALL the required inputs)
      @transpiler.after_row(@categoriser, using: [:domain, :host, :created_at])
      @transpiler.after_row(@indexer, using: [:username])
    end

    # Reads the gzipped CSV payload from +io+, transpiles each row into the
    # weblog partitioner, then commits everything.
    # @return the copy-file writers produced by the partitioner commit
    def process_payload(io)
      # TODO: Look at streaming with GzipReader#read_partial later (not the lowest hanging fruit right now)
      gz = Zlib::GzipReader.new(io)
      strio = StringIO.new(gz.read)
      gz.close

      weblog_partitioner = WeblogPartition.get_partitioner(
        @account_id,
        @transpiler.index_of(:created_at),
        @transpiler.headers
      )

      # Improvements for the future
      # - Switched from excelsior to CSV because it was segfaulting under the latest Ruby
      # - Ideally we would use ccsv here but it cannot read from an IO, only a file
      #   We could look at using sidekiq with Pure Ruby
      #   or have a job unzip to a second file first (not very memory efficient, and error prone)
      #   Or look at rewriting the collector in Go and use http://www.goworker.org/ (Ideal but lots of work)
      csv = CSV.new(strio)
      until csv.eof?
        # TODO: rename the transpile method
        weblog_partitioner.add_row(@transpiler.transpile(csv.readline))
      end

      # TODO: These could write on the fly in time
      copy_file_writers = weblog_partitioner.commit!(@account_id)
      @categoriser.commit!
      @indexer.commit!
      copy_file_writers
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  # A Partition of weblog rows, binned by the calendar date of their
  # created_at timestamp.
  class WeblogPartition < Partition
    # Calculates the ID for this partition.
    # Think of it as a statistical 'bin'.
    #
    # In this case we take the date from a timestamp.
    #
    # @return [Date]
    def self.partition_id(value)
      Date.new(value.year, value.month, value.mday)
    end

    # @return [Hash] keys identifying the backing table for this partition
    def keys
      { account_id: account_id, created_at: range_from_date }
    end

    # @return [Range<Time>] the UTC half-open range covering this
    #   partition's date (midnight up to, but excluding, the next midnight)
    def range_from_date
      date = partition_id
      next_date = date.next
      from = Time.utc(date.year, date.month, date.mday)
      upto = Time.utc(next_date.year, next_date.month, next_date.mday)
      from...upto
    end
  end
end
|
data/nfcollector.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nfcollector/version'

Gem::Specification.new do |gem|
  gem.name          = "nfcollector"
  gem.version       = Nfcollector::VERSION
  gem.authors       = ["Dan Draper"]
  gem.email         = ["daniel@codefire.com"]
  gem.description   = %q{Data collection system for NetFox}
  gem.summary       = %q{Data Collection}
  gem.homepage      = ""

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only dependencies
  gem.add_development_dependency 'test-unit'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'shoulda'
  gem.add_development_dependency 'timecop'
  gem.add_development_dependency 'differ'
  gem.add_development_dependency 'byebug'
  gem.add_development_dependency 'activerecord'

  # Runtime dependencies
  gem.add_runtime_dependency 'domainatrix', '>= 0.0.10'
  gem.add_runtime_dependency 'activesupport', '>= 3.2'
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec_helper'

describe Nfcollector::AttributeValidator do
  subject(:validator) { Nfcollector::AttributeValidator.new }

  specify "raise if a duplicate attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>a,>A,t,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::DuplicateAttribute)
  end

  specify "raise if an unknown attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,t,Un,Hs,mt,<s,Cs,GG")
    }.to raise_error(Nfcollector::UnknownAttribute, "Unknown Attribute GG")
  end

  specify "raise if a required attribute is NOT provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::MissingRequiredAttribute, "Missing attribute 't'")
  end
end
|