nfcollector 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +1 -0
  9. data/lib/nfcollector.rb +41 -0
  10. data/lib/nfcollector/attribute_validator.rb +59 -0
  11. data/lib/nfcollector/attributes.rb +99 -0
  12. data/lib/nfcollector/categoriser.rb +43 -0
  13. data/lib/nfcollector/category_partition.rb +17 -0
  14. data/lib/nfcollector/configuration.rb +24 -0
  15. data/lib/nfcollector/copy_file_writer.rb +47 -0
  16. data/lib/nfcollector/domain_parser.rb +49 -0
  17. data/lib/nfcollector/input_definition.rb +31 -0
  18. data/lib/nfcollector/mapping.rb +7 -0
  19. data/lib/nfcollector/mapping/categories_processor.rb +36 -0
  20. data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
  21. data/lib/nfcollector/mapping/default_output.rb +45 -0
  22. data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
  23. data/lib/nfcollector/mapping/indexer.rb +21 -0
  24. data/lib/nfcollector/mapping/mapped_row.rb +21 -0
  25. data/lib/nfcollector/mapping/output.rb +59 -0
  26. data/lib/nfcollector/mapping/transpiler.rb +92 -0
  27. data/lib/nfcollector/nfcollector_exception.rb +4 -0
  28. data/lib/nfcollector/partition.rb +76 -0
  29. data/lib/nfcollector/partitioner.rb +37 -0
  30. data/lib/nfcollector/payload_processor.rb +46 -0
  31. data/lib/nfcollector/sequence_generator.rb +11 -0
  32. data/lib/nfcollector/version.rb +3 -0
  33. data/lib/nfcollector/weblog_partition.rb +26 -0
  34. data/nfcollector.gemspec +30 -0
  35. data/spec/attribute_validator_spec.rb +23 -0
  36. data/spec/attributes_spec.rb +15 -0
  37. data/spec/command_parser_spec.rb +81 -0
  38. data/spec/copy_file_writer_spec.rb +95 -0
  39. data/spec/input_definition_spec.rb +18 -0
  40. data/spec/nfcollector/category_partitioner_spec.rb +51 -0
  41. data/spec/nfcollector/date_partitioner_spec.rb +19 -0
  42. data/spec/nfcollector/input_definition_spec.rb +32 -0
  43. data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
  44. data/spec/nfcollector/mapping/output_spec.rb +76 -0
  45. data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
  46. data/spec/payload_job_spec.rb +11 -0
  47. data/spec/payload_processor_spec.rb +114 -0
  48. data/spec/spec_helper.rb +89 -0
  49. data/test/domains_hosts +194826 -0
  50. data/test/generate_input.rb +79 -0
  51. data/test/input/input-1000.csv +1000 -0
  52. data/test/input/input-100000.csv +100000 -0
  53. data/test/input/input-100000.dat +64039 -0
  54. data/test/input/input-no-tags.csv +3 -0
  55. data/test/input/input-no-tags.dat +3 -0
  56. data/test/input/input-no-tags.gz +0 -0
  57. data/test/input/input-with-tags.csv.gz +0 -0
  58. data/test/test_helper.rb +15 -0
  59. data/test/tester.rb +32 -0
  60. metadata +252 -0
module Nfcollector
  module Mapping
    # Accumulates the set of usernames seen while mapping rows so they can
    # all be committed in one batch once the payload run finishes.
    class Indexer
      # @param account_id [Integer,String] account the usernames belong to
      def initialize(account_id)
        @account_id = account_id
        @users = {}
      end

      # Records the username found in +mapped_row+ at the first of +indices+.
      # A Hash is used as a set here: it is much faster than Set for this
      # workload, and the values leave room for extra per-user data later.
      def perform(mapped_row, indices)
        username = mapped_row.values_at(*indices).first
        @users[username] = nil
      end

      # Hands the collected usernames to the configured committer.
      def commit!
        Configuration.user_committer.commit(@account_id, @users)
      end
    end
  end
end
module Nfcollector
  module Mapping
    # One transpiled output row. Behaves like a plain Array of column values
    # but additionally carries an optional primary key (+id+) and the
    # category ids resolved for the row.
    class MappedRow < Array
      attr_writer :category_ids
      attr_accessor :id

      # Builds the row from the given column values.
      def initialize(*values)
        super()
        push(*values)
      end

      # Category ids attached to this row; empty when none were assigned.
      def category_ids
        @category_ids || []
      end

      # Prefixes the usual Array inspect output with the row id when present.
      def inspect
        [@id, super].compact.join(":")
      end
    end
  end
end
@@ -0,0 +1,59 @@
1
+ module Nfcollector
2
+ module Mapping
3
+ # TODO: Consider merging this and DefaultOutput
4
+ class Output
5
+ class << self
6
+ attr_accessor :categories_processor
7
+
8
+ def outputs
9
+ @output ||= []
10
+ end
11
+
12
+ def output(name, options = {})
13
+ self.outputs << new(name, options)
14
+ end
15
+
16
+ def categories(options = {})
17
+ self.categories_processor = CategoriesProcessor.new(options)
18
+ end
19
+
20
+ def build_transpiler(input_definition, column_options = {})
21
+ Transpiler.new(input_definition, column_options).tap do |transpiler|
22
+ outputs.each do |output|
23
+ transpiler.add_column(output)
24
+ end
25
+ transpiler.process_categories_with(self.categories_processor)
26
+ end
27
+ end
28
+ end
29
+
30
+ attr_reader :name
31
+ attr_reader :inputs
32
+ attr_reader :process_with
33
+
34
+ def initialize(name, options = {})
35
+ @name = name.to_sym
36
+ @inputs = Array(options.fetch(:inputs, @name))
37
+ @process_with = options.fetch(:process_with, @name)
38
+ @pkey = options[:primary_key]
39
+ @optional = options[:optional]
40
+ end
41
+
42
+ def optional?
43
+ @optional == true
44
+ end
45
+
46
+ def required?
47
+ !optional?
48
+ end
49
+
50
+ def primary_key?
51
+ @pkey == true
52
+ end
53
+
54
+ def method_missing(method, arg, options = {})
55
+ return arg
56
+ end
57
+ end
58
+ end
59
+ end
@@ -0,0 +1,92 @@
1
+
2
+ module Nfcollector
3
+ module Mapping
4
+ class AfterRowHook
5
+ # Represents a hook that is called after each row has been transpiled
6
+ # @param [Object,Class] processor an object or class that responds to the perform method
7
+ # @param [Array<Integer>] an array of indicies from which the processing arguments are taken in the transpiled row
8
+ def initialize(processor, indicies)
9
+ @processor = processor
10
+ @indicies = indicies
11
+ end
12
+
13
+ def perform(mapped_row)
14
+ @processor.perform(mapped_row, @indicies)
15
+ end
16
+ end
17
+
18
+ class Transpiler
19
+ attr_reader :outputs
20
+
21
+ def initialize(input_definition, column_options = {})
22
+ @input_definition = input_definition
23
+ @outputs = []
24
+ @after_row_hooks = []
25
+ @column_options = column_options
26
+ end
27
+
28
+ def headers
29
+ @outputs.map(&:name)
30
+ end
31
+
32
+ def add_column(output)
33
+ # TODO: Make this output.should_transpile?(@input_definition)
34
+ if output.inputs.blank? || @input_definition.has_index_for?(output.inputs) || output.required?
35
+ column_transpiler = ColumnTranspiler.new(output, @column_options)
36
+ column_transpiler.build!(@input_definition)
37
+ @outputs << column_transpiler
38
+ if output.primary_key?
39
+ @primary_key_index = @outputs.size - 1
40
+ end
41
+ end
42
+ end
43
+
44
+ def process_categories_with(categories_processor)
45
+ @categories_processor = categories_processor
46
+ if @input_definition.has_index_for?(:category) || @categories_processor.required?
47
+ @category_index = @input_definition.column_index(:category)
48
+ end
49
+ end
50
+
51
+ def after_row(processor, options = {})
52
+ required_columns = options[:using]
53
+ indicies = Array(required_columns).map { |column| index_of(column) }
54
+ @after_row_hooks << AfterRowHook.new(processor, indicies)
55
+ end
56
+
57
+ # @param [Symbol] name of the column
58
+ # @return the index of the column name in the output
59
+ def index_of(column)
60
+ @outputs.index { |output| output.name == column }
61
+ end
62
+
63
+ def transpile(row)
64
+ MappedRow.new.tap do |out|
65
+ @outputs.each do |ct|
66
+ out << ct.go(row)
67
+ end
68
+ out.category_ids = process_categories(row)
69
+ out.id = primary_key(out)
70
+ process_after_row_hooks(out)
71
+ end
72
+ end
73
+
74
+ def process_after_row_hooks(mapped_row)
75
+ @after_row_hooks.each do |hook|
76
+ hook.perform(mapped_row)
77
+ end
78
+ end
79
+
80
+ def process_categories(row)
81
+ if @categories_processor.present? && @category_index.present?
82
+ @categories_processor.perform(row[@category_index])
83
+ end
84
+ end
85
+
86
+ def primary_key(output_row)
87
+ return nil if @primary_key_index.blank?
88
+ output_row[@primary_key_index]
89
+ end
90
+ end
91
+ end
92
+ end
module Nfcollector
  # Base error type raised by the collector.
  class NfcollectorException < RuntimeError
  end
end
module Nfcollector
  # Base class for a single partition of output data. A partition collects
  # the rows that share one partition-key value and knows which backing table
  # and copy file they should be written to. Subclasses must implement
  # +keys+ and a class-level +partition_id(value)+.
  class Partition
    attr_reader :rows
    # NOTE: the original also declared `attr_reader :partition_id`, but that
    # accessor was dead code — it read an ivar that is never set and was
    # shadowed by the #partition_id method below. Removed.
    attr_reader :headers
    attr_reader :account_id

    # Instantiates a partitioner that will generate partitions
    # of this type
    #
    # @param [Integer,String] account_id the account_id for which we are processing data
    # @param [Integer] partition_key_index the index in each row of the partition key
    # @param [Array] headers an array of headers for writing the data to file
    #
    def self.get_partitioner(account_id, partition_key_index, headers)
      Partitioner.new(self, account_id, partition_key_index).tap do |part|
        part.set_headers!(headers)
      end
    end

    # The ActiveRecord class used to back this partition type
    class_attribute :ar_klass

    # Create a new partition for the given partition-key value.
    # @param value the partition-key value shared by all rows in this partition
    # @param account_id [Integer,String]
    # @param headers [Array] column headers used when writing the copy file
    def initialize(value, account_id, headers)
      @value = value
      @account_id = account_id
      @headers = headers
      @rows = []
    end

    # The bin identifier for this partition, computed by the subclass.
    def partition_id
      self.class.partition_id(@value)
    end

    # Adds a row to the partition, prefixed with the account id.
    # @param [MappedRow] row
    def add_row(row)
      # TODO: Is there a cleaner way to do this??
      @rows << [@account_id] + row
    end

    # Return the table name that should be used for the data in this
    # partition, creating the backing table partition on first use.
    def table_name
      return @table_name if @table_name
      partition = ar_klass.partitions.find_or_create_for(keys)
      @table_name = partition.nil? ? ar_klass.table_name : partition.name
    end

    # Unique file the partition's copy data will be written to.
    def file_name
      @file_name ||= File.expand_path("#{table_name}_#{randstr}.copy", Configuration.output_dir)
    end

    # @return [Hash] a set of keys used to generate the table for this partition
    def keys
      # FIX: was `raise NotImplemented` — that constant does not exist in
      # Ruby, so misuse raised NameError instead of the intended error.
      raise NotImplementedError, "#{self.class} must implement #keys"
    end

    private

    # Eight random lowercase letters for unique copy-file names.
    def randstr
      "".tap do |str|
        # FIX: was (rand * 25).to_i + 97, which could never produce 'z'.
        8.times { str << (rand(26) + 97).chr }
      end
    end
  end
end
module Nfcollector
  # Groups mapped rows into partitions keyed by the value of one column,
  # then commits each partition to its own copy file.
  class Partitioner
    # Index (within each row) of the partition-key column.
    # FIX: was exposed as `column_indexes`, which read a never-assigned ivar
    # and therefore always returned nil; renamed to match the actual state.
    attr_reader :column_index
    # @return [Hash{Object => Partition}] partitions keyed by partition id
    attr_reader :data

    # TODO: use 2.1 named parameters when we can guarantee Ruby 2.1
    # @param partition_klass [Class] Partition subclass to instantiate
    # @param account_id [Integer,String]
    # @param column_index [Integer] index of the partition-key column
    def initialize(partition_klass, account_id, column_index)
      @partition_klass = partition_klass
      @account_id = account_id
      @column_index = column_index
      @data = {}
    end

    # Headers handed to each partition (used when writing copy files).
    def set_headers!(headers)
      @headers = headers
    end

    # Routes +row+ into the partition for its key value, creating that
    # partition on first sight.
    def add_row(row)
      value = row_value(row)
      id = @partition_klass.partition_id(value)
      partition = (@data[id] ||= @partition_klass.new(value, @account_id, @headers))
      partition.add_row(row)
    end

    # Extracts the partition-key value from a row.
    def row_value(row)
      row[@column_index]
    end

    # Writes every partition out via CopyFileWriter.
    # @param _account_id unused; retained for caller compatibility
    # @return [Array] one writer result per partition
    def commit!(_account_id)
      @data.map do |(_key, partition)|
        CopyFileWriter.open(partition) do |copy|
          copy.commit!
        end
      end
    end
  end
end
module Nfcollector
  # Drives end-to-end processing of one uploaded payload: decompress the
  # gzipped CSV, transpile each row, partition by date and commit the lot.
  class PayloadProcessor

    def initialize(account_id, tz, attributes_string)
      @account_id = account_id
      @tz = tz
      input_definition = Attributes.parse(attributes_string)
      # TODO: Will need to specify the Output class to use later
      @transpiler = Mapping::DefaultOutput.build_transpiler(input_definition, tz: @tz)
      @categoriser = Categoriser.new(account_id)
      @indexer = Mapping::Indexer.new(account_id)
      # TODO: Do the category lookups inside the categoriser and pass it the
      # categories string (only do this if the input definition has the
      # categories — actually ALL the required inputs)
      @transpiler.after_row(@categoriser, using: [:domain, :host, :created_at])
      @transpiler.after_row(@indexer, using: [:username])
    end

    # Processes a gzipped CSV payload read from +io+.
    # @return [Array] the copy-file writers produced by the partition commit
    def process_payload(io)
      # TODO: Look at streaming with GzipReader#read_partial later (not the
      # lowest hanging fruit right now)
      reader = Zlib::GzipReader.new(io)
      buffer = StringIO.new(reader.read)
      reader.close

      partitioner = WeblogPartition.get_partitioner(
        @account_id,
        @transpiler.index_of(:created_at),
        @transpiler.headers
      )

      # Improvements for the future:
      # - Switched from excelsior to CSV because it was segfaulting under the
      #   latest Ruby
      # - Ideally we would use ccsv here but it cannot read from an IO, only
      #   a file. We could look at using sidekiq with Pure Ruby, or have a
      #   job unzip to a second file first (not very memory efficient, and
      #   error prone), or look at rewriting the collector in Go and use
      #   http://www.goworker.org/ (ideal but lots of work)
      parser = CSV.new(buffer)
      until parser.eof?
        row = @transpiler.transpile(parser.readline) # TODO: rename the transpile method
        partitioner.add_row(row)
      end

      # TODO: These could write on the fly in time
      writers = partitioner.commit!(@account_id)
      @categoriser.commit!
      @indexer.commit!
      writers
    end
  end
end
@@ -0,0 +1,11 @@
1
+ module Nfcollector
2
+ class SequenceGenerator
3
+ def initialize
4
+ @start = 0 # TODO: This will have to be smarter (needs to keep a track of the last used value somewhere)
5
+ end
6
+
7
+ def next
8
+ @start += 1
9
+ end
10
+ end
11
+ end
module Nfcollector
  # Gem version string; frozen so the shared constant cannot be mutated.
  VERSION = "3.2.0".freeze
end
module Nfcollector
  # Partition of weblog rows, binned by the calendar date taken from each
  # row's created_at timestamp.
  class WeblogPartition < Partition

    # Calculates the ID for this partition.
    # Think of it as a statistical 'bin' — here, the date of the timestamp.
    # @param value [#year,#month,#mday] a timestamp-like object
    # @return [Date]
    def self.partition_id(value)
      Date.new(value.year, value.month, value.mday)
    end

    # Keys used to find/create the backing table partition.
    def keys
      { account_id: account_id, created_at: range_from_date }
    end

    # Half-open UTC time range [midnight, next midnight) covering this
    # partition's date.
    def range_from_date
      date = partition_id
      # FIX: was `partition_id.next`, which called the method a second time
      # and rebuilt the Date needlessly.
      next_date =
      start_time = Time.utc(date.year, date.month, date.mday)
      end_time = Time.utc(next_date.year, next_date.month, next_date.mday)
      start_time...end_time
    end
  end
end
# -*- encoding: utf-8 -*-
# Gem specification for nfcollector.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nfcollector/version'

Gem::Specification.new do |spec|
  spec.name          = "nfcollector"
  spec.version       = Nfcollector::VERSION
  spec.authors       = ["Dan Draper"]
  spec.email         = ["daniel@codefire.com"]
  spec.description   = "Data collection system for NetFox"
  spec.summary       = "Data Collection"
  spec.homepage      = ""

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies
  spec.add_development_dependency 'test-unit'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'shoulda'
  spec.add_development_dependency 'timecop'
  spec.add_development_dependency 'differ'
  spec.add_development_dependency 'byebug'
  spec.add_development_dependency 'activerecord'

  # Runtime dependencies
  spec.add_runtime_dependency 'domainatrix', '>= 0.0.10'
  spec.add_runtime_dependency 'activesupport', '>= 3.2'
end
require 'spec_helper'

describe Nfcollector::AttributeValidator do
  subject(:validator) { Nfcollector::AttributeValidator.new }

  specify "raise if a duplicate attribute is provided" do
    # ">a" appears twice in the attribute list
    expect {
      validator.validate(">a,Rr,Rh,Rp,>a,>A,t,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::DuplicateAttribute)
  end

  specify "raise if an unknown attribute is provided" do
    # "GG" is not a recognised attribute code
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,t,Un,Hs,mt,<s,Cs,GG")
    }.to raise_error(Nfcollector::UnknownAttribute, "Unknown Attribute GG")
  end

  specify "raise if a required attribute is NOT provided" do
    # the mandatory "t" attribute is omitted from the list
    expect {
      validator.validate(">a,Rr,Rh,Rp,>A,Un,Hs,mt,<s,Cs")
    }.to raise_error(Nfcollector::MissingRequiredAttribute, "Missing attribute 't'")
  end
end