nfcollector 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +3 -0
- data/.rvmrc +1 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/lib/nfcollector.rb +41 -0
- data/lib/nfcollector/attribute_validator.rb +59 -0
- data/lib/nfcollector/attributes.rb +99 -0
- data/lib/nfcollector/categoriser.rb +43 -0
- data/lib/nfcollector/category_partition.rb +17 -0
- data/lib/nfcollector/configuration.rb +24 -0
- data/lib/nfcollector/copy_file_writer.rb +47 -0
- data/lib/nfcollector/domain_parser.rb +49 -0
- data/lib/nfcollector/input_definition.rb +31 -0
- data/lib/nfcollector/mapping.rb +7 -0
- data/lib/nfcollector/mapping/categories_processor.rb +36 -0
- data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
- data/lib/nfcollector/mapping/default_output.rb +45 -0
- data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
- data/lib/nfcollector/mapping/indexer.rb +21 -0
- data/lib/nfcollector/mapping/mapped_row.rb +21 -0
- data/lib/nfcollector/mapping/output.rb +59 -0
- data/lib/nfcollector/mapping/transpiler.rb +92 -0
- data/lib/nfcollector/nfcollector_exception.rb +4 -0
- data/lib/nfcollector/partition.rb +76 -0
- data/lib/nfcollector/partitioner.rb +37 -0
- data/lib/nfcollector/payload_processor.rb +46 -0
- data/lib/nfcollector/sequence_generator.rb +11 -0
- data/lib/nfcollector/version.rb +3 -0
- data/lib/nfcollector/weblog_partition.rb +26 -0
- data/nfcollector.gemspec +30 -0
- data/spec/attribute_validator_spec.rb +23 -0
- data/spec/attributes_spec.rb +15 -0
- data/spec/command_parser_spec.rb +81 -0
- data/spec/copy_file_writer_spec.rb +95 -0
- data/spec/input_definition_spec.rb +18 -0
- data/spec/nfcollector/category_partitioner_spec.rb +51 -0
- data/spec/nfcollector/date_partitioner_spec.rb +19 -0
- data/spec/nfcollector/input_definition_spec.rb +32 -0
- data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
- data/spec/nfcollector/mapping/output_spec.rb +76 -0
- data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
- data/spec/payload_job_spec.rb +11 -0
- data/spec/payload_processor_spec.rb +114 -0
- data/spec/spec_helper.rb +89 -0
- data/test/domains_hosts +194826 -0
- data/test/generate_input.rb +79 -0
- data/test/input/input-1000.csv +1000 -0
- data/test/input/input-100000.csv +100000 -0
- data/test/input/input-100000.dat +64039 -0
- data/test/input/input-no-tags.csv +3 -0
- data/test/input/input-no-tags.dat +3 -0
- data/test/input/input-no-tags.gz +0 -0
- data/test/input/input-with-tags.csv.gz +0 -0
- data/test/test_helper.rb +15 -0
- data/test/tester.rb +32 -0
- metadata +252 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # Collects the distinct usernames seen while mapping rows for one
    # account and commits them in bulk via the configured user committer.
    class Indexer
      def initialize(account_id)
        @account_id = account_id
        # May seem clunky but hashes are much faster than Sets
        # (plus we can store other things here later)
        @users = {}
      end

      # Record the username found in +mapped_row+ at the first of the
      # given output +indicies+.
      def perform(mapped_row, indicies)
        username = mapped_row.values_at(*indicies).first
        @users[username] = nil
      end

      # Flush the accumulated usernames for this account.
      def commit!
        Configuration.user_committer.commit(@account_id, @users)
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # A transpiled output row. Behaves like an Array of column values with
    # an optional primary key (+id+) and a list of category ids attached.
    class MappedRow < Array
      attr_writer :category_ids
      attr_accessor :id

      def initialize(*args)
        super()
        replace(args)
      end

      # Category ids associated with this row; empty when none were set.
      def category_ids
        @category_ids || []
      end

      # Prefix the array representation with the row id when one is set.
      def inspect
        [@id, super].compact.join(":")
      end
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Nfcollector
  module Mapping
    # TODO: Consider merging this and DefaultOutput
    #
    # Describes a single output column: which input columns feed it
    # (+inputs+), how its value is processed (+process_with+), and whether
    # it is optional or the primary key. The class level doubles as a small
    # DSL for declaring the full set of output columns.
    class Output
      class << self
        attr_accessor :categories_processor

        # All declared output columns, in declaration order.
        # Fixed: previously memoized into a mismatched @output ivar.
        def outputs
          @outputs ||= []
        end

        # DSL: declare an output column.
        def output(name, options = {})
          outputs << new(name, options)
        end

        # DSL: declare how the category column is processed.
        def categories(options = {})
          self.categories_processor = CategoriesProcessor.new(options)
        end

        # Build a Transpiler wired with every declared output column and
        # the configured categories processor.
        def build_transpiler(input_definition, column_options = {})
          Transpiler.new(input_definition, column_options).tap do |transpiler|
            outputs.each { |output| transpiler.add_column(output) }
            transpiler.process_categories_with(categories_processor)
          end
        end
      end

      attr_reader :name
      attr_reader :inputs
      attr_reader :process_with

      # @param [String,Symbol] name output column name
      # @param [Hash] options :inputs, :process_with, :primary_key, :optional
      def initialize(name, options = {})
        @name = name.to_sym
        @inputs = Array(options.fetch(:inputs, @name))
        @process_with = options.fetch(:process_with, @name)
        @pkey = options[:primary_key]
        @optional = options[:optional]
      end

      # Only a literal +true+ marks the column optional.
      def optional?
        @optional == true
      end

      def required?
        !optional?
      end

      def primary_key?
        @pkey == true
      end

      # Unknown messages act as a pass-through processor: the first
      # argument is returned unchanged (used when a column needs no
      # transformation).
      def method_missing(method, arg, options = {})
        arg
      end

      # Keep respond_to? consistent with the catch-all method_missing.
      def respond_to_missing?(method, include_private = false)
        true
      end
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  module Mapping
    # Represents a hook that is called after each row has been transpiled
    class AfterRowHook
      # @param [Object,Class] processor an object or class that responds to the perform method
      # @param [Array<Integer>] indicies an array of indicies from which the
      #   processing arguments are taken in the transpiled row
      def initialize(processor, indicies)
        @processor = processor
        @indicies = indicies
      end

      # Forward the mapped row plus the captured indicies to the processor.
      def perform(mapped_row)
        @processor.perform(mapped_row, @indicies)
      end
    end

    # Turns raw input rows into MappedRow outputs according to a set of
    # output column definitions, optionally processing categories and
    # running after-row hooks.
    #
    # NOTE(review): blank?/present? used below come from ActiveSupport —
    # this class assumes activesupport is loaded.
    class Transpiler
      attr_reader :outputs

      def initialize(input_definition, column_options = {})
        @input_definition = input_definition
        @outputs = []
        @after_row_hooks = []
        @column_options = column_options
      end

      # Output column names, in output order.
      def headers
        @outputs.map(&:name)
      end

      # Register an output column. Only columns whose inputs are available
      # (or which are required / need no inputs) are transpiled; the
      # primary-key column's position is remembered for id extraction.
      def add_column(output)
        # TODO: Make this output.should_transpile?(@input_definition)
        if output.inputs.blank? || @input_definition.has_index_for?(output.inputs) || output.required?
          column_transpiler = ColumnTranspiler.new(output, @column_options)
          column_transpiler.build!(@input_definition)
          @outputs << column_transpiler
          if output.primary_key?
            @primary_key_index = @outputs.size - 1
          end
        end
      end

      # Remember the categories processor and, when a :category input is
      # available (or categories are required), its input column index.
      # NOTE(review): when the processor is required but the input has no
      # :category column, column_index may return nil here — confirm
      # upstream behaviour.
      def process_categories_with(categories_processor)
        @categories_processor = categories_processor
        if @input_definition.has_index_for?(:category) || @categories_processor.required?
          @category_index = @input_definition.column_index(:category)
        end
      end

      # Register a processor to run after each transpiled row, fed with the
      # output columns named by options[:using].
      def after_row(processor, options = {})
        required_columns = options[:using]
        indicies = Array(required_columns).map { |column| index_of(column) }
        @after_row_hooks << AfterRowHook.new(processor, indicies)
      end

      # @param [Symbol] column name of the column
      # @return the index of the column name in the output
      def index_of(column)
        @outputs.index { |output| output.name == column }
      end

      # Transpile one input row into a MappedRow: run every column
      # transpiler, attach category ids and the primary key, then fire the
      # after-row hooks.
      def transpile(row)
        MappedRow.new.tap do |out|
          @outputs.each do |ct|
            out << ct.go(row)
          end
          out.category_ids = process_categories(row)
          out.id = primary_key(out)
          process_after_row_hooks(out)
        end
      end

      def process_after_row_hooks(mapped_row)
        @after_row_hooks.each do |hook|
          hook.perform(mapped_row)
        end
      end

      # Run the categories processor on the row's category value when both
      # a processor and a category index are present; returns nil otherwise.
      def process_categories(row)
        if @categories_processor.present? && @category_index.present?
          @categories_processor.perform(row[@category_index])
        end
      end

      # The value of the primary-key column in +output_row+, or nil when no
      # primary-key column was registered.
      def primary_key(output_row)
        return nil if @primary_key_index.blank?
        output_row[@primary_key_index]
      end
    end
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  # Base class for a single partition of rows destined for one database
  # table. Subclasses define .partition_id(value) and #keys.
  class Partition
    attr_reader :rows
    attr_reader :headers
    attr_reader :account_id

    # Instantiates a partitioner that will generate partitions
    # of this type
    #
    # @param [Integer,String] account_id the account_id for which we are processing data
    # @param [Integer] partition_key_index the index in each row of the partition key
    # @param [Array] headers an array of headers for writing the data to file
    #
    def self.get_partitioner(account_id, partition_key_index, headers)
      Partitioner.new(self, account_id, partition_key_index).tap do |part|
        part.set_headers!(headers)
      end
    end

    # The ActiveRecord class used to back this partition type
    class_attribute :ar_klass

    # Create a new partition with the given values
    # @param value the partition-key value shared by all rows in this partition
    # @param [Integer,String] account_id
    # @param [Array] headers column headers used when writing to file
    #
    def initialize(value, account_id, headers)
      @value = value
      @account_id = account_id
      @headers = headers
      @rows = []
    end

    # Delegates to the subclass' partition_id for this partition's value.
    # (The former attr_reader :partition_id was shadowed by this method and
    # has been removed.)
    def partition_id
      self.class.partition_id(@value)
    end

    # Adds a row to the partition, prefixed with the account id.
    # @param [MappedRow] row
    #
    def add_row(row)
      # TODO: Is there a cleaner way to do this??
      @rows << [ @account_id ] + row
    end

    # Return the table name that should be used for the data
    # in this partition (memoized). Falls back to the base table when no
    # partition record can be found or created.
    #
    def table_name
      return @table_name if @table_name
      partition = ar_klass.partitions.find_or_create_for(keys)
      @table_name = partition.nil? ? ar_klass.table_name : partition.name
    end

    # Unique output file path for this partition's COPY data.
    def file_name
      @file_name ||= File.expand_path("#{table_name}_#{randstr}.copy", Configuration.output_dir)
    end

    # @return [Hash] a set of keys used to generate the table for this partition
    # @raise [NotImplementedError] subclasses must override
    #
    def keys
      # Fixed: `NotImplemented` is not a Ruby constant — raising it raised
      # NameError instead of the intended NotImplementedError.
      raise NotImplementedError
    end

    private

    # Random 8-character lowercase suffix for file names.
    # NOTE(review): (rand * 25) yields 'a'..'y' only ('z' never appears) —
    # harmless for uniqueness purposes.
    def randstr
      "".tap do |str|
        8.times do
          str << ((rand * 25).to_i + 97).chr
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Nfcollector
  # Routes mapped rows into partitions. Each row's partition is determined
  # by the value at +column_index+; partition objects of +partition_klass+
  # are created lazily, one per distinct partition id.
  class Partitioner
    attr_reader :data
    attr_reader :column_index

    # Fixed: the reader was declared as +column_indexes+ but no matching
    # ivar was ever set, so it always returned nil. Alias the corrected
    # reader so any existing callers keep working and get the real value.
    alias_method :column_indexes, :column_index

    # TODO: use 2.1 named parameters when we can guarantee Ruby 2.1
    # @param [Class] partition_klass must respond to .partition_id(value)
    #   and .new(value, account_id, headers)
    # @param [Integer,String] account_id
    # @param [Integer] column_index index of the partition key in each row
    def initialize(partition_klass, account_id, column_index)
      @partition_klass = partition_klass
      @account_id = account_id
      @column_index = column_index
      @data = {}
    end

    def set_headers!(headers)
      @headers = headers
    end

    # Adds +row+ to the partition it belongs to, creating it on first use.
    def add_row(row)
      p_value = row_value(row)
      p_id = @partition_klass.partition_id(p_value)
      @data[p_id] ||= @partition_klass.new(p_value, @account_id, @headers)
      @data[p_id].add_row(row)
    end

    # The partition-key value for +row+.
    def row_value(row)
      row[@column_index]
    end

    # Writes every partition out via CopyFileWriter; returns one writer
    # result per partition.
    # NOTE(review): +account_id+ is unused here (the partitioner already
    # holds @account_id) but is kept for interface compatibility.
    def commit!(account_id)
      @data.map do |(key, partition)|
        CopyFileWriter.open(partition) do |copy|
          copy.commit!
        end
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Nfcollector
  # Orchestrates processing of one gzipped CSV payload for an account:
  # builds the transpiler from the payload's attribute definition, maps
  # every row, partitions rows by date, then commits partitions,
  # categories and the user index.
  class PayloadProcessor

    # @param [Integer,String] account_id
    # @param tz timezone value passed through to the transpiler's column
    #   options (semantics defined by DefaultOutput — confirm there)
    # @param [String] attributes_string the payload's attribute definition
    def initialize(account_id, tz, attributes_string)
      @account_id = account_id
      @tz = tz
      input_definition = Attributes.parse(attributes_string)
      # TODO: Will need to specify the Output class to use later
      @transpiler = Mapping::DefaultOutput.build_transpiler(input_definition, tz: @tz)
      @categoriser = Categoriser.new(account_id)
      @indexer = Mapping::Indexer.new(account_id)
      # TODO: Do the category lookups inside the categoriser and pass it the categories string
      # (Only do this if the input definition has the categories - actually ALL the required inputs)
      @transpiler.after_row(@categoriser, using: [ :domain, :host, :created_at ])
      @transpiler.after_row(@indexer, using: [ :username ])
    end

    # Reads a gzip-compressed CSV payload from +io+, transpiles every row
    # and commits the results.
    # @param [IO] io readable stream of gzipped CSV
    # @return the value of the weblog partitioner's commit!
    def process_payload(io)
      # TODO: Look at streaming with GzipReader#read_partial later (not the lowest hanging fruit right now)
      # The whole payload is inflated into memory up front.
      gz = Zlib::GzipReader.new(io)
      strio = StringIO.new(gz.read)
      gz.close

      weblog_partitioner = WeblogPartition.get_partitioner(@account_id, @transpiler.index_of(:created_at), @transpiler.headers)

      # Improvements for the future
      # - Switched from excelsior to CSV because it was segfaulting under the latest Ruby
      # - Ideally we would use ccsv here but it cannot read from an IO, only a file
      # We could look at using sidekiq with Pure Ruby
      # or have a job unzip to a second file first (not very memory efficient, and error prone)
      # Or look at rewriting the collector in Go and use http://www.goworker.org/ (Ideal but lots of work)
      #
      csv = CSV.new(strio)
      while !csv.eof?
        mapped_row = @transpiler.transpile(csv.readline) # TODO: rename the transpile method
        weblog_partitioner.add_row(mapped_row)
      end

      # TODO: These could write on the fly in time
      cfws = weblog_partitioner.commit!(@account_id)
      @categoriser.commit!
      @indexer.commit!
      return cfws
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
  # A Partition whose rows are binned by calendar date (UTC) taken from a
  # timestamp column.
  class WeblogPartition < Partition
    # Calculates the ID for this partition
    # Think of it as a statistical 'bin'
    #
    # In this case we take the date from a timestamp
    #
    def self.partition_id(value)
      Date.new(value.year, value.month, value.mday)
    end

    # Keys used to locate or create the backing table for this partition.
    def keys
      { account_id: account_id, created_at: range_from_date }
    end

    # Half-open UTC time range covering this partition's calendar day.
    def range_from_date
      date = partition_id
      following = date.next
      lower = Time.utc(date.year, date.month, date.mday)
      upper = Time.utc(following.year, following.month, following.mday)
      lower...upper
    end
  end
end
|
data/nfcollector.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for nfcollector. Puts lib/ on the load path so the
# version constant can be read without installing the gem first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nfcollector/version'

Gem::Specification.new do |gem|
  gem.name          = "nfcollector"
  gem.version       = Nfcollector::VERSION
  gem.authors       = ["Dan Draper"]
  gem.email         = ["daniel@codefire.com"]
  gem.description   = %q{Data collection system for NetFox}
  gem.summary       = %q{Data Collection}
  gem.homepage      = ""

  # Ship everything git tracks; executables come from bin/, tests from
  # test/, spec/ and features/.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'test-unit'
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'shoulda'
  gem.add_development_dependency 'timecop'
  gem.add_development_dependency 'differ'
  gem.add_development_dependency 'byebug'
  gem.add_development_dependency 'activerecord'

  gem.add_runtime_dependency 'domainatrix', '>= 0.0.10'
  gem.add_runtime_dependency 'activesupport', '>= 3.2'
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'spec_helper'

describe Nfcollector::AttributeValidator do
  # Named subject; examples use the name for clarity (previously the name
  # was declared but every example used the anonymous `subject`).
  subject(:validator) { Nfcollector::AttributeValidator.new }

  specify "raise if a duplicate attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>a,>A,t,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::DuplicateAttribute)
  end

  specify "raise if an unknown attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,t,Un,Hs,mt,<s,Cs,GG")
    }.to raise_error(Nfcollector::UnknownAttribute, "Unknown Attribute GG")
  end

  specify "raise if a required attribute is NOT provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::MissingRequiredAttribute, "Missing attribute 't'")
  end
end
|