nfcollector 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +1 -0
  9. data/lib/nfcollector.rb +41 -0
  10. data/lib/nfcollector/attribute_validator.rb +59 -0
  11. data/lib/nfcollector/attributes.rb +99 -0
  12. data/lib/nfcollector/categoriser.rb +43 -0
  13. data/lib/nfcollector/category_partition.rb +17 -0
  14. data/lib/nfcollector/configuration.rb +24 -0
  15. data/lib/nfcollector/copy_file_writer.rb +47 -0
  16. data/lib/nfcollector/domain_parser.rb +49 -0
  17. data/lib/nfcollector/input_definition.rb +31 -0
  18. data/lib/nfcollector/mapping.rb +7 -0
  19. data/lib/nfcollector/mapping/categories_processor.rb +36 -0
  20. data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
  21. data/lib/nfcollector/mapping/default_output.rb +45 -0
  22. data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
  23. data/lib/nfcollector/mapping/indexer.rb +21 -0
  24. data/lib/nfcollector/mapping/mapped_row.rb +21 -0
  25. data/lib/nfcollector/mapping/output.rb +59 -0
  26. data/lib/nfcollector/mapping/transpiler.rb +92 -0
  27. data/lib/nfcollector/nfcollector_exception.rb +4 -0
  28. data/lib/nfcollector/partition.rb +76 -0
  29. data/lib/nfcollector/partitioner.rb +37 -0
  30. data/lib/nfcollector/payload_processor.rb +46 -0
  31. data/lib/nfcollector/sequence_generator.rb +11 -0
  32. data/lib/nfcollector/version.rb +3 -0
  33. data/lib/nfcollector/weblog_partition.rb +26 -0
  34. data/nfcollector.gemspec +30 -0
  35. data/spec/attribute_validator_spec.rb +23 -0
  36. data/spec/attributes_spec.rb +15 -0
  37. data/spec/command_parser_spec.rb +81 -0
  38. data/spec/copy_file_writer_spec.rb +95 -0
  39. data/spec/input_definition_spec.rb +18 -0
  40. data/spec/nfcollector/category_partitioner_spec.rb +51 -0
  41. data/spec/nfcollector/date_partitioner_spec.rb +19 -0
  42. data/spec/nfcollector/input_definition_spec.rb +32 -0
  43. data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
  44. data/spec/nfcollector/mapping/output_spec.rb +76 -0
  45. data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
  46. data/spec/payload_job_spec.rb +11 -0
  47. data/spec/payload_processor_spec.rb +114 -0
  48. data/spec/spec_helper.rb +89 -0
  49. data/test/domains_hosts +194826 -0
  50. data/test/generate_input.rb +79 -0
  51. data/test/input/input-1000.csv +1000 -0
  52. data/test/input/input-100000.csv +100000 -0
  53. data/test/input/input-100000.dat +64039 -0
  54. data/test/input/input-no-tags.csv +3 -0
  55. data/test/input/input-no-tags.dat +3 -0
  56. data/test/input/input-no-tags.gz +0 -0
  57. data/test/input/input-with-tags.csv.gz +0 -0
  58. data/test/test_helper.rb +15 -0
  59. data/test/tester.rb +32 -0
  60. metadata +252 -0
@@ -0,0 +1,21 @@
1
module Nfcollector
  module Mapping
    # Accumulates the distinct usernames seen while transpiling a payload
    # and commits them in a single batch through the configured committer.
    class Indexer
      def initialize(account_id)
        @account_id = account_id
        @users = {}
      end

      # After-row hook entry point.
      # May seem clunky but hashes are much faster than Sets
      # (plus we can store other things here later)
      #
      # @param [Array] mapped_row the transpiled output row
      # @param [Array<Integer>] indicies positions to read; the first is the username
      def perform(mapped_row, indicies)
        picked = mapped_row.values_at(*indicies)
        @users[picked.first] = nil
      end

      # Flush every recorded username for this account via the
      # configured user committer.
      def commit!
        Configuration.user_committer.commit(@account_id, @users)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Nfcollector
  module Mapping
    # A transpiled output row. Acts as a plain Array of column values,
    # augmented with an optional primary-key id and category ids.
    class MappedRow < Array
      attr_writer :category_ids
      attr_accessor :id

      # Accepts the column values directly, e.g. MappedRow.new(1, 2, 3).
      def initialize(*values)
        super()
        concat(values)
      end

      # Category ids attached to this row (empty until assigned).
      def category_ids
        @category_ids.nil? ? [] : @category_ids
      end

      # "id:[values]" when an id is set, otherwise just the array form.
      def inspect
        pieces = [@id, super].compact
        pieces.join(":")
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
module Nfcollector
  module Mapping
    # Declarative description of a single output column, plus a class-level
    # DSL (output / categories) for declaring the full column set and
    # building a Transpiler from it.
    # TODO: Consider merging this and DefaultOutput
    class Output
      class << self
        attr_accessor :categories_processor

        # All declared outputs, in declaration order.
        def outputs
          # BUG FIX: was memoised into @output (singular), which silently
          # mismatched the reader name; use @outputs for consistency.
          @outputs ||= []
        end

        # DSL: declare one output column.
        def output(name, options = {})
          outputs << new(name, options)
        end

        # DSL: declare how the category column should be processed.
        def categories(options = {})
          self.categories_processor = CategoriesProcessor.new(options)
        end

        # Builds a Transpiler wired up with every declared output column
        # and the categories processor.
        def build_transpiler(input_definition, column_options = {})
          Transpiler.new(input_definition, column_options).tap do |transpiler|
            outputs.each do |output|
              transpiler.add_column(output)
            end
            transpiler.process_categories_with(categories_processor)
          end
        end
      end

      attr_reader :name
      attr_reader :inputs
      attr_reader :process_with

      # @param [String,Symbol] name output column name
      # @param [Hash] options :inputs (defaults to name), :process_with
      #   (defaults to name), :primary_key, :optional
      def initialize(name, options = {})
        @name = name.to_sym
        @inputs = Array(options.fetch(:inputs, @name))
        @process_with = options.fetch(:process_with, @name)
        @pkey = options[:primary_key]
        @optional = options[:optional]
      end

      def optional?
        @optional == true
      end

      def required?
        !optional?
      end

      def primary_key?
        @pkey == true
      end

      # Identity fallback: any unrecognised processing method passes the
      # value through unchanged (invoked dynamically via process_with).
      def method_missing(method, arg, options = {})
        arg
      end

      # BUG FIX: method_missing without respond_to_missing? leaves
      # respond_to? inconsistent with actual behaviour; mirror the
      # catch-all here.
      def respond_to_missing?(method, include_private = false)
        true
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
+
2
module Nfcollector
  module Mapping
    # Represents a hook that is called after each row has been transpiled.
    class AfterRowHook
      # @param [Object,Class] processor an object or class that responds to #perform
      # @param [Array<Integer>] indicies positions in the transpiled row from
      #   which the processing arguments are taken
      def initialize(processor, indicies)
        @indicies = indicies
        @processor = processor
      end

      # Hand the finished row (plus the configured index list) to the processor.
      def perform(mapped_row)
        target, positions = @processor, @indicies
        target.perform(mapped_row, positions)
      end
    end
  end
end
17
+
18
module Nfcollector
  module Mapping
    # Maps raw input rows into MappedRow outputs. Columns are added as
    # ColumnTranspiler instances (one per declared Output that the input
    # actually provides), categories are resolved separately, and a list
    # of after-row hooks is run on each finished row.
    #
    # NOTE(review): relies on ActiveSupport's blank?/present? — confirm
    # activesupport is loaded wherever this is used.
    class Transpiler
      attr_reader :outputs

      # @param input_definition describes the incoming columns (must respond
      #   to has_index_for? and column_index)
      # @param [Hash] column_options passed through to each ColumnTranspiler (e.g. tz:)
      def initialize(input_definition, column_options = {})
        @input_definition = input_definition
        @outputs = []
        @after_row_hooks = []
        @column_options = column_options
      end

      # Output column names, in output order.
      def headers
        @outputs.map(&:name)
      end

      # Adds a column if the input provides it (or the output is required /
      # takes no inputs). Remembers the position of the primary-key column.
      def add_column(output)
        # TODO: Make this output.should_transpile?(@input_definition)
        if output.inputs.blank? || @input_definition.has_index_for?(output.inputs) || output.required?
          column_transpiler = ColumnTranspiler.new(output, @column_options)
          column_transpiler.build!(@input_definition)
          @outputs << column_transpiler
          if output.primary_key?
            @primary_key_index = @outputs.size - 1
          end
        end
      end

      # Stores the categories processor and, when the input carries a
      # category column (or the processor insists on one), records its index.
      # NOTE(review): if the processor is required? but the input has no
      # :category column, column_index may yield nil — verify upstream.
      def process_categories_with(categories_processor)
        @categories_processor = categories_processor
        if @input_definition.has_index_for?(:category) || @categories_processor.required?
          @category_index = @input_definition.column_index(:category)
        end
      end

      # Registers a hook run after each transpiled row; options[:using] names
      # the output columns whose indices are handed to the processor.
      def after_row(processor, options = {})
        required_columns = options[:using]
        indicies = Array(required_columns).map { |column| index_of(column) }
        @after_row_hooks << AfterRowHook.new(processor, indicies)
      end

      # @param [Symbol] name of the column
      # @return the index of the column name in the output
      def index_of(column)
        @outputs.index { |output| output.name == column }
      end

      # Transpiles one raw input row into a MappedRow: runs every column
      # transpiler, attaches category ids and the primary key, then fires
      # the after-row hooks.
      def transpile(row)
        MappedRow.new.tap do |out|
          @outputs.each do |ct|
            out << ct.go(row)
          end
          out.category_ids = process_categories(row)
          out.id = primary_key(out)
          process_after_row_hooks(out)
        end
      end

      # Runs each registered after-row hook against the finished row.
      def process_after_row_hooks(mapped_row)
        @after_row_hooks.each do |hook|
          hook.perform(mapped_row)
        end
      end

      # Resolves category ids for the row; nil when no processor or no
      # category column is configured.
      def process_categories(row)
        if @categories_processor.present? && @category_index.present?
          @categories_processor.perform(row[@category_index])
        end
      end

      # Value of the primary-key column in the output row, or nil when no
      # primary key was declared.
      def primary_key(output_row)
        return nil if @primary_key_index.blank?
        output_row[@primary_key_index]
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
+
2
module Nfcollector
  # Base error type for collector-specific failures.
  NfcollectorException = Class.new(RuntimeError)
end
@@ -0,0 +1,76 @@
1
+
2
module Nfcollector
  # Base class for one partition of output data. A partition groups mapped
  # rows that share a partition key (see WeblogPartition) and resolves the
  # database table / copy file its rows should be written to.
  class Partition
    attr_reader :rows
    attr_reader :headers
    attr_reader :account_id
    # NOTE: a redundant `attr_reader :partition_id` was removed here — it was
    # always shadowed by the partition_id method defined below.

    # Instantiates a partitioner that will generate partitions
    # of this type
    #
    # @param [Integer,String] account_id the account_id for which we are processing data
    # @param [Integer] partition_key_index the index in each row of the partition key
    # @param [Array] headers an array of headers for writing the data to file
    #
    def self.get_partitioner(account_id, partition_key_index, headers)
      Partitioner.new(self, account_id, partition_key_index).tap do |part|
        part.set_headers!(headers)
      end
    end

    # The ActiveRecord class used to back this partition type
    class_attribute :ar_klass

    # Create a new partition holding rows for a single partition-key value
    # @param value the partition key value (e.g. a timestamp)
    # @param [Integer,String] account_id
    # @param [Array] headers column headers used when writing the data file
    #
    def initialize(value, account_id, headers)
      @value = value
      @account_id = account_id
      @headers = headers
      @rows = []
    end

    # The 'bin' this partition's value falls into; computed by the subclass.
    def partition_id
      self.class.partition_id(@value)
    end

    # Adds a row to the partition, prefixed with the account id
    # @param [MappedRow] row
    #
    def add_row(row)
      # TODO: Is there a cleaner way to do this??
      @rows << [ @account_id ] + row
    end

    # Return the table name that should be used for the data
    # in this partition; falls back to the backing class's base table
    # when no dedicated partition table can be found or created.
    #
    def table_name
      return @table_name if @table_name
      partition = ar_klass.partitions.find_or_create_for(keys)
      @table_name = partition.nil? ? ar_klass.table_name : partition.name
    end

    # Path of the COPY file backing this partition (random suffix avoids
    # collisions between concurrent runs).
    def file_name
      @file_name ||= File.expand_path("#{table_name}_#{randstr}.copy", Configuration.output_dir)
    end

    # @return [Hash] a set of keys used to generate the table for this partition
    # @raise [NotImplementedError] subclasses must implement
    #
    def keys
      # BUG FIX: was `raise NotImplemented`, which raised NameError
      # (uninitialized constant NotImplemented) rather than the intended error.
      raise NotImplementedError
    end

    private

    # Eight random lowercase letters.
    def randstr
      # BUG FIX: previous (rand * 25) arithmetic could never produce 'z'.
      Array.new(8) { (rand(26) + 97).chr }.join
    end
  end
end
@@ -0,0 +1,37 @@
1
module Nfcollector
  # Routes mapped rows into Partition objects keyed by their partition id,
  # then commits each partition out to its own copy file.
  class Partitioner
    # BUG FIX: was `attr_reader :column_indexes`, which never matched the
    # @column_index ivar and therefore always returned nil.
    attr_reader :column_index
    attr_reader :data

    # TODO: use 2.1 named parameters when we can guarantee Ruby 2.1
    # @param [Class] partition_klass the Partition subclass to instantiate
    # @param [Integer,String] account_id
    # @param [Integer] column_index index of the partition key in each row
    def initialize(partition_klass, account_id, column_index)
      @partition_klass = partition_klass
      @account_id = account_id
      @column_index = column_index
      @data = {}
    end

    def set_headers!(headers)
      @headers = headers
    end

    # Adds a row to the partition it belongs to, lazily creating the
    # partition the first time its id is seen.
    def add_row(row)
      p_value = row_value(row)
      p_id = @partition_klass.partition_id(p_value)
      @data[p_id] ||= @partition_klass.new(p_value, @account_id, @headers)
      @data[p_id].add_row(row)
    end

    # The partition-key value of a row.
    def row_value(row)
      row[@column_index]
    end

    # Writes every partition via CopyFileWriter.
    # NOTE(review): the account_id parameter is unused (@account_id is already
    # held) — kept for caller compatibility.
    def commit!(account_id)
      @data.map do |(key, partition)|
        CopyFileWriter.open(partition) do |copy|
          copy.commit!
        end
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module Nfcollector
  # Orchestrates processing of one gzipped CSV payload for a single account:
  # parses the attribute header into an input definition, transpiles each CSV
  # row, buckets rows into date partitions, and commits the partitions plus
  # the category and user index data.
  class PayloadProcessor

    # @param [Integer,String] account_id the account the payload belongs to
    # @param tz timezone forwarded to the column transpilers
    # @param [String] attributes_string compact attribute codes describing the input columns
    def initialize(account_id, tz, attributes_string)
      @account_id = account_id
      @tz = tz
      input_definition = Attributes.parse(attributes_string)
      # TODO: Will need to specify the Output class to use later
      @transpiler = Mapping::DefaultOutput.build_transpiler(input_definition, tz: @tz)
      @categoriser = Categoriser.new(account_id)
      @indexer = Mapping::Indexer.new(account_id)
      # TODO: Do the category lookups inside the categoriser and pass it the categories string
      # (Only do this if the input definition has the categories - actuall ALL the required inputs)
      @transpiler.after_row(@categoriser, using: [ :domain, :host, :created_at ])
      @transpiler.after_row(@indexer, using: [ :username ])
    end

    # Inflates the gzipped CSV payload, transpiles every row into the
    # weblog partitioner, then commits partitions, categories and users.
    # @param [IO] io stream containing gzip-compressed CSV data
    # @return the copy-file writers produced by the partitioner's commit!
    def process_payload(io)
      # TODO: Look at streaming with GzipReader#read_partial later (not the lowest hanging fruit right now)
      # Whole payload is inflated into memory up-front.
      gz = Zlib::GzipReader.new(io)
      strio = StringIO.new(gz.read)
      gz.close

      # Partition rows by the :created_at output column.
      weblog_partitioner = WeblogPartition.get_partitioner(@account_id, @transpiler.index_of(:created_at), @transpiler.headers)

      # Improvements for the future
      # - Switched from excelsior to CSV because it was segfaulting under the latest Ruby
      # - Ideally we would use ccsv here but it cannot read from an IO, only a file
      #   We could look at using sidekiq with Pure Ruby
      #   or have a job unzip to a second file first (not very memory efficient, and error prone)
      #   Or look at rewriting the collector in Go and use http://www.goworker.org/ (Ideal but lots of work)
      #
      csv = CSV.new(strio)
      while !csv.eof?
        mapped_row = @transpiler.transpile(csv.readline) # TODO: rename the transpile method
        weblog_partitioner.add_row(mapped_row)
      end

      # TODO: These could write on the fly in time
      cfws = weblog_partitioner.commit!(@account_id)
      @categoriser.commit!
      @indexer.commit!
      return cfws
    end
  end
end
@@ -0,0 +1,11 @@
1
module Nfcollector
  # Hands out monotonically increasing integer ids, starting from 1.
  class SequenceGenerator
    def initialize
      # TODO: This will have to be smarter (needs to keep a track of the last used value somewhere)
      @current = 0
    end

    # @return [Integer] the next id in the sequence
    def next
      @current += 1
    end
  end
end
@@ -0,0 +1,3 @@
1
module Nfcollector
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = "3.2.0".freeze
end
@@ -0,0 +1,26 @@
1
+
2
module Nfcollector
  class WeblogPartition < Partition

    # Calculates the ID for this partition
    # Think of it as a statistical 'bin'
    #
    # In this case we take the date from a timestamp
    #
    def self.partition_id(value)
      Date.new(value.year, value.month, value.mday)
    end

    # Keys used to locate/create the backing table partition.
    def keys
      { account_id: account_id, created_at: range_from_date }
    end

    # Half-open UTC time range spanning this partition's calendar day.
    def range_from_date
      day = partition_id
      following = day.next
      lower = Time.utc(day.year, day.month, day.mday)
      upper = Time.utc(following.year, following.month, following.mday)
      lower...upper
    end
  end
end
@@ -0,0 +1,30 @@
1
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nfcollector/version'

Gem::Specification.new do |gem|
  gem.name          = "nfcollector"
  gem.version       = Nfcollector::VERSION
  gem.authors       = ["Dan Draper"]
  gem.email         = ["daniel@codefire.com"]
  gem.description   = %q{Data collection system for NetFox}
  gem.summary       = %q{Data Collection}
  gem.homepage      = ""

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only tooling.
  %w[test-unit rspec shoulda timecop differ byebug activerecord].each do |dep|
    gem.add_development_dependency dep
  end

  # Runtime dependencies.
  gem.add_runtime_dependency 'domainatrix', '>= 0.0.10'
  gem.add_runtime_dependency 'activesupport', '>= 3.2'
end
@@ -0,0 +1,23 @@
1
require 'spec_helper'

describe Nfcollector::AttributeValidator do
  subject(:validator) { Nfcollector::AttributeValidator.new }

  specify "raise if a duplicate attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>a,>A,t,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::DuplicateAttribute)
  end

  specify "raise if an unknown attribute is provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,t,Un,Hs,mt,<s,Cs,GG")
    }.to raise_error(Nfcollector::UnknownAttribute, "Unknown Attribute GG")
  end

  specify "raise if a required attribute is NOT provided" do
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::MissingRequiredAttribute, "Missing attribute 't'")
  end
end