nfcollector 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +1 -0
  9. data/lib/nfcollector.rb +41 -0
  10. data/lib/nfcollector/attribute_validator.rb +59 -0
  11. data/lib/nfcollector/attributes.rb +99 -0
  12. data/lib/nfcollector/categoriser.rb +43 -0
  13. data/lib/nfcollector/category_partition.rb +17 -0
  14. data/lib/nfcollector/configuration.rb +24 -0
  15. data/lib/nfcollector/copy_file_writer.rb +47 -0
  16. data/lib/nfcollector/domain_parser.rb +49 -0
  17. data/lib/nfcollector/input_definition.rb +31 -0
  18. data/lib/nfcollector/mapping.rb +7 -0
  19. data/lib/nfcollector/mapping/categories_processor.rb +36 -0
  20. data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
  21. data/lib/nfcollector/mapping/default_output.rb +45 -0
  22. data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
  23. data/lib/nfcollector/mapping/indexer.rb +21 -0
  24. data/lib/nfcollector/mapping/mapped_row.rb +21 -0
  25. data/lib/nfcollector/mapping/output.rb +59 -0
  26. data/lib/nfcollector/mapping/transpiler.rb +92 -0
  27. data/lib/nfcollector/nfcollector_exception.rb +4 -0
  28. data/lib/nfcollector/partition.rb +76 -0
  29. data/lib/nfcollector/partitioner.rb +37 -0
  30. data/lib/nfcollector/payload_processor.rb +46 -0
  31. data/lib/nfcollector/sequence_generator.rb +11 -0
  32. data/lib/nfcollector/version.rb +3 -0
  33. data/lib/nfcollector/weblog_partition.rb +26 -0
  34. data/nfcollector.gemspec +30 -0
  35. data/spec/attribute_validator_spec.rb +23 -0
  36. data/spec/attributes_spec.rb +15 -0
  37. data/spec/command_parser_spec.rb +81 -0
  38. data/spec/copy_file_writer_spec.rb +95 -0
  39. data/spec/input_definition_spec.rb +18 -0
  40. data/spec/nfcollector/category_partitioner_spec.rb +51 -0
  41. data/spec/nfcollector/date_partitioner_spec.rb +19 -0
  42. data/spec/nfcollector/input_definition_spec.rb +32 -0
  43. data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
  44. data/spec/nfcollector/mapping/output_spec.rb +76 -0
  45. data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
  46. data/spec/payload_job_spec.rb +11 -0
  47. data/spec/payload_processor_spec.rb +114 -0
  48. data/spec/spec_helper.rb +89 -0
  49. data/test/domains_hosts +194826 -0
  50. data/test/generate_input.rb +79 -0
  51. data/test/input/input-1000.csv +1000 -0
  52. data/test/input/input-100000.csv +100000 -0
  53. data/test/input/input-100000.dat +64039 -0
  54. data/test/input/input-no-tags.csv +3 -0
  55. data/test/input/input-no-tags.dat +3 -0
  56. data/test/input/input-no-tags.gz +0 -0
  57. data/test/input/input-with-tags.csv.gz +0 -0
  58. data/test/test_helper.rb +15 -0
  59. data/test/tester.rb +32 -0
  60. metadata +252 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 00d69574752ea8a74595a80a94dbbe8120d05fd1
4
+ data.tar.gz: 1411de4b5124e5b1841543b6dade0461b9b7e57e
5
+ SHA512:
6
+ metadata.gz: 6a4ddd1387607a78ba9f8b3c86e83e73b2c7aff2ecdf0f927e171e968571d68ac3d76c814583873abdf81a17adf931f7c797089b5942dd6c40efead201c6bc36
7
+ data.tar.gz: 602f74db253c3f55c60984dc4f3286bc8224ee071ea07f4aace9a78c6c8fc911ff2cff2b5acfc8126c65f8867d65b2ac2ba86e72b626f041fc916519765f9685
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --warnings
3
+ --require spec_helper
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 2.1.2@nfcollector2 --create
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in nfcollector.gemspec
4
+ gemspec
5
+
6
+ gem 'rspec'
7
+
8
+ gem 'rspecify'
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Dan Draper
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Nfcollector
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'nfcollector'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install nfcollector
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,41 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require 'rubygems'
5
+ require "bundler/setup"
6
+ require 'socket'
7
+ require 'csv'
8
+ require 'zlib'
9
+ require 'time'
10
+ require 'domainatrix'
11
+ require 'base64'
12
+ require 'active_support/core_ext/string/conversions'
13
+ require 'active_support/core_ext/object/with_options'
14
+ require 'active_support/core_ext/class/attribute'
15
+
16
+ require 'nfcollector/configuration'
17
+ require 'nfcollector/input_definition'
18
+
19
+ require 'nfcollector/mapping'
20
+ require 'nfcollector/sequence_generator'
21
+ require 'nfcollector/nfcollector_exception'
22
+ require 'nfcollector/attribute_validator'
23
+ require 'nfcollector/attributes'
24
+ require 'nfcollector/categoriser'
25
+ require 'nfcollector/copy_file_writer'
26
+ require 'nfcollector/domain_parser'
27
+ require 'nfcollector/partitioner'
28
+ require 'nfcollector/partition'
29
+ require 'nfcollector/category_partition'
30
+ require 'nfcollector/weblog_partition'
31
+ require 'nfcollector/payload_processor'
32
+
33
module Nfcollector
  # Gives access to the gem-wide Configuration class.
  #
  # When a block is supplied the Configuration class is yielded to it first,
  # so settings can be assigned inline. The Configuration class itself is
  # always the return value.
  def self.config
    yield Configuration if block_given?
    Configuration
  end
end
40
+
41
+ Nfcollector.config.set_defaults!
@@ -0,0 +1,59 @@
1
+ module Nfcollector
2
# Raised by the validator when one of the required attribute codes is not
# present in the supplied attribute list.
class MissingRequiredAttribute < NfcollectorException
  # The attribute code that was expected but not found
  attr_reader :attr

  # attr is the missing attribute code; it is also handed to the parent
  # exception class as its argument.
  def initialize(attr)
    @attr = attr
    super(attr)
  end

  # Description naming the missing attribute code
  def message
    "Missing attribute '#{attr}'"
  end
end
14
+
15
# Raised by the validator when the same attribute code is listed more than once.
class DuplicateAttribute < NfcollectorException
  # Fixed description; the offending code is not included in the text
  def message
    'Duplicate attribute detected'
  end
end
20
+
21
# Raised by the validator when an attribute code has no entry in
# Attributes::MAPPINGS.
class UnknownAttribute < NfcollectorException
  # Prefixes the parent class's message (the value the exception was
  # constructed with) with a fixed label.
  def message
    inherited = super
    "Unknown Attribute #{inherited}"
  end
end
26
+
27
# Validates a comma separated string of attribute codes.
#
# Checks run in this order and the first failure is raised:
#   1. every code is a key of Attributes::MAPPINGS  (UnknownAttribute)
#   2. no code appears twice                         (DuplicateAttribute)
#   3. every code in Attributes::REQUIRED is present (MissingRequiredAttribute)
class AttributeValidator
  # Shortcut for AttributeValidator.new(attributes).validate!
  def self.validate!(attributes)
    new(attributes).validate!
  end

  # attributes is the raw comma separated attribute string
  def initialize(attributes)
    @attributes = attributes
  end

  # Runs all checks against the attribute string given to the constructor.
  # Raises on the first problem found.
  def validate!
    codes = @attributes.split(',')

    # Known attrs: each code must have a mapping
    codes.each do |code|
      raise UnknownAttribute, code unless Attributes::MAPPINGS.key?(code)
    end

    # Dupes: remember what has been seen so far, in order
    seen = []
    codes.each do |code|
      raise DuplicateAttribute, code if seen.include?(code)
      seen << code
    end

    # Required attrs
    Attributes::REQUIRED.each do |required|
      raise MissingRequiredAttribute, required unless codes.include?(required)
    end
  end
end
59
+ end
@@ -0,0 +1,99 @@
1
module Nfcollector
  # Catalogue of the attribute codes accepted in an input definition and the
  # column names they map to.
  #
  # All lookup tables are frozen so that a caller cannot accidentally alter
  # the shared definitions at runtime.
  class Attributes

    # Short attribute codes as they appear in the comma separated attribute string
    ATTR_CREATED_AT       = 't'.freeze
    ATTR_CLIENT_IP        = '>a'.freeze
    ATTR_CACHED           = 'Cs'.freeze
    ATTR_HTTP_RESP        = 'Hs'.freeze
    ATTR_BYTES            = '<s'.freeze
    ATTR_HOST             = 'Rh'.freeze
    ATTR_PATH             = 'Rp'.freeze
    ATTR_USERNAME         = 'Un'.freeze
    ATTR_MIME_TYPE        = 'mt'.freeze
    ATTR_REQUEST_RESPONSE = 'Rr'.freeze
    ATTR_USER_GROUP       = 'Ug'.freeze
    ATTR_CLIENT_FQDN      = '>A'.freeze
    ATTR_COMPUTER_GROUP   = '>G'.freeze
    ATTR_CATEGORY         = 'Rc'.freeze

    # Input Attributes mapped to their column or map reduce names
    MAPPINGS = {
      ATTR_CREATED_AT       => :created_at,
      ATTR_CLIENT_IP        => :client_ip,
      ATTR_CACHED           => :cached,
      ATTR_HTTP_RESP        => :http_resp_code,
      ATTR_BYTES            => :bytes,
      ATTR_HOST             => :host,
      ATTR_PATH             => :path,
      ATTR_USERNAME         => :username,
      ATTR_MIME_TYPE        => :mime_type,
      ATTR_REQUEST_RESPONSE => :request_response,
      ATTR_USER_GROUP       => :user_group,
      ATTR_CLIENT_FQDN      => :client_fqdn,
      ATTR_COMPUTER_GROUP   => :computer_group,
      ATTR_CATEGORY         => :category
    }.freeze

    # Column names that may be written to a copy file
    COPY_FILE_COLUMNS = [
      :created_at,
      :client_ip,
      :cached,
      :http_resp_code,
      :bytes,
      :host,
      :path,
      :username,
      :mime_type,
      :request_response,
      :user_group,
      :client_fqdn,
      :computer_group,
      :domain
    ].freeze

    # Column names used for map reduce
    MAP_REDUCE_COLUMNS = [
      :created_at,
      :bytes,
      :host,
      :username,
      :domain,
      :category
    ].freeze

    # Attribute codes that every input definition must contain
    REQUIRED = [
      ATTR_CREATED_AT,
      ATTR_CLIENT_IP,
      ATTR_CACHED,
      ATTR_HTTP_RESP,
      ATTR_BYTES,
      ATTR_HOST,
      ATTR_PATH,
      ATTR_USERNAME,
      ATTR_MIME_TYPE
    ].freeze

    # Selects from an array of column names (strings or symbols)
    # only the ones that can be used for copy files.
    # Returns the selected names as symbols, in their original order.
    def self.for_copy_file(arr)
      # TODO: No longer needed?
      arr.map(&:to_sym).select { |column| COPY_FILE_COLUMNS.include?(column) }
    end

    # Parses a comma separated list of attrs into an InputDefinition,
    # setting each position to the column name mapped to the code found
    # there (nil when the code has no mapping).
    def self.parse(attributes_string)
      InputDefinition.new.tap do |definition|
        attributes_string.split(',').each_with_index do |attr, index|
          definition.set(index, MAPPINGS[attr])
        end
      end
    end

    # Returns the column name associated with the given attr code
    # (string or symbol), or nil when the code is not known.
    def self.map(attr_code)
      MAPPINGS[attr_code.to_s]
    end
  end
end
@@ -0,0 +1,43 @@
1
# Entry stores its categories in a Set; load it explicitly rather than
# relying on another library having required it first.
require 'set'

module Nfcollector
  # Collects, per domain and per host, how often each was seen, when it was
  # last seen and which category ids were attached to it, then hands both
  # collections to the configured committers.
  class Categoriser
    # Running statistics for a single domain or host
    class Entry
      attr_reader :last_seen
      attr_reader :hits
      attr_reader :categories

      def initialize
        @hits = 0
        @last_seen = Time.at(0)
        @categories = Set.new
      end

      # Records one more hit, moves last_seen forward when the given time is
      # later than the stored one, and adds the category ids to the set.
      def update(last_seen, category_ids)
        @hits += 1
        @last_seen = last_seen if @last_seen < last_seen
        @categories.merge(category_ids)
      end
    end

    # account_id is passed through to the committers on commit!
    def initialize(account_id)
      @account_id = account_id
      @domains = {}
      @hosts = {}
    end

    # Updates the domain and host entries for one mapped row.
    #
    # mapped_row must respond to values_at and category_ids; indicies are the
    # positions of the domain, host and last-seen values (in that order).
    def perform(mapped_row, indicies)
      domain, host, last_seen = mapped_row.values_at(*indicies)

      # Fetch-or-create each entry once instead of looking it up twice
      domain_entry = (@domains[domain] ||= Entry.new)
      host_entry   = (@hosts[host] ||= Entry.new)

      category_ids = mapped_row.category_ids
      domain_entry.update(last_seen, category_ids)
      host_entry.update(last_seen, category_ids)
    end

    # Passes the collected domain and host entries to the committers set on
    # Configuration, together with the account id.
    def commit!
      # TODO: This feels a bit clunky...
      Configuration.categorisation_domains_committer.commit(@account_id, @domains)
      Configuration.categorisation_hosts_committer.commit(@account_id, @hosts)
    end
  end
end
@@ -0,0 +1,17 @@
1
+
2
module Nfcollector
  # Partition whose 'bin' is a single category.
  class CategoryPartition < Partition
    class << self
      # Works out which partition (statistical 'bin') a value belongs to.
      #
      # For category partitions the category id is used unchanged.
      def partition_id(category_id)
        category_id
      end
    end

    # Key/value pairs identifying this partition: the account it belongs to
    # and the category id it was binned by.
    def keys
      {
        :account_id  => account_id,
        :category_id => partition_id
      }
    end
  end
end
@@ -0,0 +1,24 @@
1
module Nfcollector
  # Class-level settings for the gem. Every setting is declared with
  # ActiveSupport's class_attribute, so it is read and written on the class.
  class Configuration
    # General settings
    class_attribute :ar_class, :output_dir, :categories_lookup, :logger

    # The classes or objects used to commit domain and host categorisations
    # respectively (each must respond to commit and take a hash)
    class_attribute :categorisation_domains_committer,
                    :categorisation_hosts_committer

    # The class or object used to commit users
    class_attribute :user_committer

    # Whether files are deleted after processing (default true)
    class_attribute :delete_files

    class << self
      # Applies the default values: delete_files is switched on.
      def set_defaults!
        self.delete_files = true
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+
2
+ module Nfcollector
3
# Error class for a row of unexpected length (not raised within this file)
class UnexpectedRowLength < NfcollectorException
end
4
# Error class for an empty file (not raised within this file)
class FileEmpty < NfcollectorException
end
5
+
6
# Forwardable (stdlib) supplies the delegation macros used below, so this
# class does not depend on an ActiveSupport extension being loaded.
require 'forwardable'

# Writes the rows of a partition to a "COPY ... FROM stdin WITH csv;" file.
#
# The output file is created, and its two header lines written, as soon as
# the writer is constructed. commit! appends the partition's rows as CSV and
# closes the file. If anything goes wrong while writing or closing, the
# partially written file is removed and the error is re-raised.
#
# The partition must respond to file_name, table_name, headers and rows.
class CopyFileWriter
  extend Forwardable

  attr_reader :partition
  def_delegators :partition, :file_name, :table_name, :headers

  # Opens the partition's output file and writes the header lines.
  def initialize(partition)
    @partition = partition
    prepare
  end

  # Builds a writer for the partition, yields it when a block is given and
  # returns it. The file is left open; call commit! to write and close it.
  def self.open(partition)
    cfw = new(partition)
    yield cfw if block_given?
    cfw
  end

  # Writes every row of the partition as a fully quoted CSV line, then
  # closes the file. Returns nil.
  #
  # Raises whatever error occurred while writing or closing, after the
  # output file has been deleted.
  def commit!
    csv = CSV.new(@file, :force_quotes => true, :skip_blanks => true)
    partition.rows.each { |row| csv << row }
    # Close inside the rescued region: buffered write errors can surface
    # here and must trigger the same cleanup as a failed row write.
    @file.close
  rescue => ex
    close_file
    handle_error!(ex)
  ensure
    # Covers exits that bypass the rescue above (non-StandardError)
    close_file
  end

  private

  # Creates the output file and writes the comment and COPY header lines
  def prepare
    @file = File.open(file_name, 'w')
    @file.puts "-- Created at: #{Time.now.utc}"
    @file.puts "COPY #{table_name} (account_id,#{headers.join(',')}) FROM stdin WITH csv;"
  end

  # Closes the file if it is still open. Errors are ignored on purpose:
  # this only runs on cleanup paths where another error is already being
  # reported or the file has been closed successfully.
  def close_file
    @file.close unless @file.closed?
  rescue IOError, SystemCallError
    nil
  end

  # Removes the (partial) output file and re-raises the given error
  def handle_error!(ex)
    File.delete(file_name) if File.file?(file_name)
    raise ex
  end
end
47
+ end
@@ -0,0 +1,49 @@
1
+
2
# Extension of Domainatrix's Url class: adds a query attribute and a helper
# for the registrable part of the host.
class Domainatrix::Url
  attr_accessor :query

  # The domain and public suffix joined with a dot; whichever of the two is
  # nil is left out.
  def toplevel
    labels = [domain, public_suffix]
    labels.compact.join('.')
  end
end
9
+
10
# Domainatrix parser variant that also captures the query string and falls
# back to treating the whole host as the domain when the host cannot be
# split against the public suffix table.
class NFODomainParser < Domainatrix::DomainParser

  # Parses url with URI and returns a Domainatrix::Url built from the
  # scheme, host, path, query and original url, plus the public suffix,
  # domain and subdomain derived from the host.
  def parse(url)
    uri = URI.parse(url)
    host_parts = parse_domains_from_host(uri.host)
    url_parts = {
      :scheme => uri.scheme,
      :host   => uri.host,
      :path   => uri.path,
      :query  => uri.query,
      :url    => url
    }
    Domainatrix::Url.new(host_parts.merge(url_parts))
  end

  # TODO: This is a big monkey patch - we should be forking and fixing this
  #
  # Splits host into :public_suffix, :domain and :subdomain by walking its
  # labels right-to-left through the nested @public_suffixes table.
  #
  # Any error raised along the way (a label missing from the table, a nil
  # host, nothing left after the suffix, ...) is rescued and the whole host
  # is returned as the :domain with the other two values nil.
  def parse_domains_from_host(host)
    labels = host.split(".").reverse
    suffix_labels = []
    domain = ""
    subdomains = []
    node = @public_suffixes

    labels.each_with_index do |label, idx|
      node = node[label]
      # The suffix ends here when the table has nothing below this label,
      # or nothing matching the next label to the left.
      suffix_ends = node.empty? || !node.has_key?(labels[idx + 1])
      suffix_labels << label
      next unless suffix_ends

      domain = labels[idx + 1]
      subdomains = labels.slice(idx + 2, labels.size)
      break
    end

    {
      :public_suffix => suffix_labels.reverse.join("."),
      :domain        => domain,
      :subdomain     => subdomains.reverse.join(".")
    }
  rescue
    # Applies to IP Addresses here too
    { :public_suffix => nil, :domain => host, :subdomain => nil }
  end

end