nfcollector 3.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.rspec +3 -0
- data/.rvmrc +1 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/lib/nfcollector.rb +41 -0
- data/lib/nfcollector/attribute_validator.rb +59 -0
- data/lib/nfcollector/attributes.rb +99 -0
- data/lib/nfcollector/categoriser.rb +43 -0
- data/lib/nfcollector/category_partition.rb +17 -0
- data/lib/nfcollector/configuration.rb +24 -0
- data/lib/nfcollector/copy_file_writer.rb +47 -0
- data/lib/nfcollector/domain_parser.rb +49 -0
- data/lib/nfcollector/input_definition.rb +31 -0
- data/lib/nfcollector/mapping.rb +7 -0
- data/lib/nfcollector/mapping/categories_processor.rb +36 -0
- data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
- data/lib/nfcollector/mapping/default_output.rb +45 -0
- data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
- data/lib/nfcollector/mapping/indexer.rb +21 -0
- data/lib/nfcollector/mapping/mapped_row.rb +21 -0
- data/lib/nfcollector/mapping/output.rb +59 -0
- data/lib/nfcollector/mapping/transpiler.rb +92 -0
- data/lib/nfcollector/nfcollector_exception.rb +4 -0
- data/lib/nfcollector/partition.rb +76 -0
- data/lib/nfcollector/partitioner.rb +37 -0
- data/lib/nfcollector/payload_processor.rb +46 -0
- data/lib/nfcollector/sequence_generator.rb +11 -0
- data/lib/nfcollector/version.rb +3 -0
- data/lib/nfcollector/weblog_partition.rb +26 -0
- data/nfcollector.gemspec +30 -0
- data/spec/attribute_validator_spec.rb +23 -0
- data/spec/attributes_spec.rb +15 -0
- data/spec/command_parser_spec.rb +81 -0
- data/spec/copy_file_writer_spec.rb +95 -0
- data/spec/input_definition_spec.rb +18 -0
- data/spec/nfcollector/category_partitioner_spec.rb +51 -0
- data/spec/nfcollector/date_partitioner_spec.rb +19 -0
- data/spec/nfcollector/input_definition_spec.rb +32 -0
- data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
- data/spec/nfcollector/mapping/output_spec.rb +76 -0
- data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
- data/spec/payload_job_spec.rb +11 -0
- data/spec/payload_processor_spec.rb +114 -0
- data/spec/spec_helper.rb +89 -0
- data/test/domains_hosts +194826 -0
- data/test/generate_input.rb +79 -0
- data/test/input/input-1000.csv +1000 -0
- data/test/input/input-100000.csv +100000 -0
- data/test/input/input-100000.dat +64039 -0
- data/test/input/input-no-tags.csv +3 -0
- data/test/input/input-no-tags.dat +3 -0
- data/test/input/input-no-tags.gz +0 -0
- data/test/input/input-with-tags.csv.gz +0 -0
- data/test/test_helper.rb +15 -0
- data/test/tester.rb +32 -0
- metadata +252 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 00d69574752ea8a74595a80a94dbbe8120d05fd1
|
4
|
+
data.tar.gz: 1411de4b5124e5b1841543b6dade0461b9b7e57e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6a4ddd1387607a78ba9f8b3c86e83e73b2c7aff2ecdf0f927e171e968571d68ac3d76c814583873abdf81a17adf931f7c797089b5942dd6c40efead201c6bc36
|
7
|
+
data.tar.gz: 602f74db253c3f55c60984dc4f3286bc8224ee071ea07f4aace9a78c6c8fc911ff2cff2b5acfc8126c65f8867d65b2ac2ba86e72b626f041fc916519765f9685
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
rvm use 2.1.2@nfcollector2 --create
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Dan Draper
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Nfcollector
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'nfcollector'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install nfcollector
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/lib/nfcollector.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__)) unless
|
2
|
+
$:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require "bundler/setup"
|
6
|
+
require 'socket'
|
7
|
+
require 'csv'
|
8
|
+
require 'zlib'
|
9
|
+
require 'time'
|
10
|
+
require 'domainatrix'
|
11
|
+
require 'base64'
|
12
|
+
require 'active_support/core_ext/string/conversions'
|
13
|
+
require 'active_support/core_ext/object/with_options'
|
14
|
+
require 'active_support/core_ext/class/attribute'
|
15
|
+
|
16
|
+
require 'nfcollector/configuration'
|
17
|
+
require 'nfcollector/input_definition'
|
18
|
+
|
19
|
+
require 'nfcollector/mapping'
|
20
|
+
require 'nfcollector/sequence_generator'
|
21
|
+
require 'nfcollector/nfcollector_exception'
|
22
|
+
require 'nfcollector/attribute_validator'
|
23
|
+
require 'nfcollector/attributes'
|
24
|
+
require 'nfcollector/categoriser'
|
25
|
+
require 'nfcollector/copy_file_writer'
|
26
|
+
require 'nfcollector/domain_parser'
|
27
|
+
require 'nfcollector/partitioner'
|
28
|
+
require 'nfcollector/partition'
|
29
|
+
require 'nfcollector/category_partition'
|
30
|
+
require 'nfcollector/weblog_partition'
|
31
|
+
require 'nfcollector/payload_processor'
|
32
|
+
|
33
|
+
# Top-level namespace of the nfcollector gem.
module Nfcollector
  class << self
    # Gives access to the gem-wide Configuration class.
    #
    # When a block is supplied, Configuration is yielded to it first so that
    # settings can be assigned in place. Configuration itself is always the
    # return value, with or without a block.
    def config
      yield Configuration if block_given?
      Configuration
    end
  end
end

# Apply the built-in defaults as soon as the library has been loaded.
Nfcollector.config.set_defaults!
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Nfcollector
|
2
|
+
# Raised when one of the required attribute codes is absent from the
# attribute list being validated.
class MissingRequiredAttribute < NfcollectorException
  # attr - the attribute code that was expected but not found.
  def initialize(attr)
    @attr = attr
    # Bare `super` forwards `attr` to the parent initializer unchanged.
    super
  end

  # The attribute code that was missing.
  def attr
    @attr
  end

  # Human readable description naming the missing attribute code.
  def message
    "Missing attribute '#{attr}'"
  end
end
|
14
|
+
|
15
|
+
# Raised when the same attribute code is listed more than once.
class DuplicateAttribute < NfcollectorException
  # Fixed description; the offending code is not included in the text.
  def message; "Duplicate attribute detected"; end
end
|
20
|
+
|
21
|
+
# Raised when an attribute code is not recognised.
class UnknownAttribute < NfcollectorException
  # Prefixes the parent class's message (the value the exception was
  # constructed with) with a fixed label.
  def message; "Unknown Attribute #{super}"; end
end
|
26
|
+
|
27
|
+
# Validates a comma separated string of attribute codes (for example
# "t,>a,Cs") against the codes declared on Attributes.
class AttributeValidator
  # attributes - comma separated String of attribute codes. nil is treated
  #              as an empty list, so it fails the required-attribute check
  #              instead of crashing with a NoMethodError.
  def initialize(attributes)
    @attributes = attributes
  end

  # Validates the attribute string given to the constructor.
  #
  # The checks run in this order, and the first failure wins:
  #   1. every code must be a key of Attributes::MAPPINGS
  #   2. no code may appear twice
  #   3. every code in Attributes::REQUIRED must be present
  #
  # Returns Attributes::REQUIRED when everything is valid.
  # Raises UnknownAttribute, DuplicateAttribute or MissingRequiredAttribute.
  def validate!
    attrs = (@attributes || "").split(",")

    # Known Attrs
    attrs.each do |attr|
      raise UnknownAttribute.new(attr) unless Attributes::MAPPINGS.key?(attr)
    end

    # Dupes - a hash of already-seen codes keeps this linear in the number
    # of attributes and is reused for the required check below.
    seen = {}
    attrs.each do |attr|
      raise DuplicateAttribute.new(attr) if seen.key?(attr)
      seen[attr] = true
    end

    # Required Attrs
    Attributes::REQUIRED.each do |attr|
      raise MissingRequiredAttribute.new(attr) unless seen.key?(attr)
    end
  end

  # Convenience wrapper: builds a validator for +attributes+ and runs it.
  def self.validate!(attributes)
    new(attributes).validate!
  end
end
|
59
|
+
end
|
@@ -0,0 +1,99 @@
|
|
1
|
+
module Nfcollector
|
2
|
+
# Catalogue of the attribute codes accepted in an input definition and the
# column names they translate to.
#
# Every constant is frozen so that the shared lookup tables cannot be
# modified by accident at runtime.
class Attributes

  ATTR_CREATED_AT       = 't'.freeze
  ATTR_CLIENT_IP        = '>a'.freeze
  ATTR_CACHED           = 'Cs'.freeze
  ATTR_HTTP_RESP        = 'Hs'.freeze
  ATTR_BYTES            = '<s'.freeze
  ATTR_HOST             = 'Rh'.freeze
  ATTR_PATH             = 'Rp'.freeze
  ATTR_USERNAME         = 'Un'.freeze
  ATTR_MIME_TYPE        = 'mt'.freeze
  ATTR_REQUEST_RESPONSE = 'Rr'.freeze
  ATTR_USER_GROUP       = 'Ug'.freeze
  ATTR_CLIENT_FQDN      = '>A'.freeze
  ATTR_COMPUTER_GROUP   = '>G'.freeze
  ATTR_CATEGORY         = 'Rc'.freeze

  # Input Attributes mapped to their column or map reduce names
  MAPPINGS = {
    ATTR_CREATED_AT       => :created_at,
    ATTR_CLIENT_IP        => :client_ip,
    ATTR_CACHED           => :cached,
    ATTR_HTTP_RESP        => :http_resp_code,
    ATTR_BYTES            => :bytes,
    ATTR_HOST             => :host,
    ATTR_PATH             => :path,
    ATTR_USERNAME         => :username,
    ATTR_MIME_TYPE        => :mime_type,
    ATTR_REQUEST_RESPONSE => :request_response,
    ATTR_USER_GROUP       => :user_group,
    ATTR_CLIENT_FQDN      => :client_fqdn,
    ATTR_COMPUTER_GROUP   => :computer_group,
    ATTR_CATEGORY         => :category
  }.freeze

  # Column names that may appear in a copy file
  COPY_FILE_COLUMNS = [
    :created_at,
    :client_ip,
    :cached,
    :http_resp_code,
    :bytes,
    :host,
    :path,
    :username,
    :mime_type,
    :request_response,
    :user_group,
    :client_fqdn,
    :computer_group,
    :domain
  ].freeze

  MAP_REDUCE_COLUMNS = [
    :created_at,
    :bytes,
    :host,
    :username,
    :domain,
    :category
  ].freeze

  # Attribute codes that AttributeValidator insists on
  REQUIRED = [
    ATTR_CREATED_AT,
    ATTR_CLIENT_IP,
    ATTR_CACHED,
    ATTR_HTTP_RESP,
    ATTR_BYTES,
    ATTR_HOST,
    ATTR_PATH,
    ATTR_USERNAME,
    ATTR_MIME_TYPE
  ].freeze

  # Selects from an array of column names
  # only the ones that can be used for copy files
  #
  # arr - Array of column names (anything responding to to_sym).
  #
  # Returns a new Array of Symbols, in the order they appeared in +arr+.
  def self.for_copy_file(arr)
    # TODO: No longer needed?
    arr.map(&:to_sym).select { |column| COPY_FILE_COLUMNS.include?(column) }
  end

  # Parses a comma separated list of attrs into an InputDefinition, setting
  # the mapped column name at each attribute's position.
  #
  # A code that is not a key of MAPPINGS is stored as nil at its position;
  # nothing is raised here.
  def self.parse(attributes_string)
    InputDefinition.new.tap do |definition|
      attributes_string.split(',').each_with_index do |attr, index|
        definition.set(index, MAPPINGS[attr])
      end
    end
  end

  # Returns the column name associated with the given attr code
  # (String or Symbol), or nil when the code is not in MAPPINGS.
  def self.map(attr_code)
    MAPPINGS[attr_code.to_s]
  end
end
|
99
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Entry keeps its categories in a Set; load it explicitly instead of
# relying on some other library having required it already.
require 'set'

module Nfcollector
  # Accumulates, per domain and per host, how often each was seen, when it
  # was last seen and which category ids were attached to it. The collected
  # data is handed to the configured committers by #commit!.
  class Categoriser
    # Running statistics for a single domain or host.
    class Entry
      attr_reader :last_seen
      attr_reader :hits
      attr_reader :categories

      def initialize
        @hits = 0
        # Epoch start, so the first real timestamp always replaces it.
        @last_seen = Time.at(0)
        @categories = Set.new
      end

      # Records one more hit.
      #
      # last_seen    - time of the hit; kept only if later than the stored one.
      # category_ids - enumerable of category ids merged into #categories.
      #
      # Returns the updated categories Set.
      def update(last_seen, category_ids)
        @hits += 1
        @last_seen = last_seen if @last_seen < last_seen
        @categories.merge(category_ids)
      end
    end

    # account_id - passed through unchanged to the committers.
    def initialize(account_id)
      @account_id = account_id
      @domains = {}
      @hosts = {}
    end

    # Records one row against its domain and its host.
    #
    # mapped_row - must respond to values_at and category_ids.
    # indicies   - positions of domain, host and last-seen time (in that
    #              order) within mapped_row.
    def perform(mapped_row, indicies)
      domain, host, last_seen = mapped_row.values_at(*indicies)
      category_ids = mapped_row.category_ids

      # Fetch-or-create and update in one step: a single hash lookup per key.
      (@domains[domain] ||= Entry.new).update(last_seen, category_ids)
      (@hosts[host] ||= Entry.new).update(last_seen, category_ids)
    end

    # Hands the collected domain and host entries to the committers set on
    # Configuration, domains first.
    def commit!
      # TODO: This feels a bit clunky...
      Configuration.categorisation_domains_committer.commit(@account_id, @domains)
      Configuration.categorisation_hosts_committer.commit(@account_id, @hosts)
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
|
3
|
+
# Partition keyed by category.
class CategoryPartition < Partition
  class << self
    # Calculates the ID of the partition (think of it as a statistical
    # 'bin') that a record belongs to.
    #
    # For categories the bin is simply the category id itself.
    def partition_id(category_id)
      category_id
    end
  end

  # Key/value pairs identifying this partition: the account id together
  # with this partition's id exposed as the category id.
  def keys
    { :account_id => account_id, :category_id => partition_id }
  end
end
|
17
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Nfcollector
|
2
|
+
# Gem-wide settings, held as class-level attributes.
class Configuration
  class_attribute :ar_class, :output_dir, :categories_lookup, :logger

  # Set the class or object that will be used to commit domain categorisations (must respond to commit and take a hash)
  class_attribute :categorisation_domains_committer

  # Set the class or object that will be used to commit host categorisations (must respond to commit and take a hash)
  class_attribute :categorisation_hosts_committer

  # Set the class or object that will be used to commit users
  class_attribute :user_committer

  # Delete files after processing (default true)
  class_attribute :delete_files

  class << self
    # Resets the settings that have a built-in default; currently that is
    # only delete_files, which becomes true.
    def set_defaults!
      self.delete_files = true
    end
  end
end
|
24
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
module Nfcollector
|
3
|
+
# Error type for a row whose length is not the expected one.
class UnexpectedRowLength < NfcollectorException
end

# Error type for an input file with no content.
class FileEmpty < NfcollectorException
end
|
5
|
+
|
6
|
+
# Writes the rows of a partition to a file made of a SQL "COPY ... FROM
# stdin WITH csv" header followed by the rows as CSV.
#
# The output file is opened, and the header written, as soon as the writer
# is constructed; the rows are written and the file closed by #commit!.
class CopyFileWriter
  attr_reader :partition
  delegate :file_name, :table_name, :headers, to: :partition

  # partition - supplies file_name, table_name, headers and rows.
  def initialize(partition)
    @partition = partition
    prepare
  end

  # Builds a writer for +partition+, yields it to the optional block and
  # returns it. The block form does not commit or close the file; the
  # caller still has to call #commit!.
  def self.open(partition)
    cfw = new(partition)
    yield cfw if block_given?
    cfw
  end

  # Appends every partition row as a fully quoted CSV line and closes the
  # file.
  #
  # If writing the rows fails, or closing the file fails (for example when
  # flushing buffered data), the incomplete file is deleted and the error is
  # re-raised, so that a truncated copy file is never left behind.
  #
  # Returns nil.
  def commit!
    begin
      csv = CSV.new(@file, :force_quotes => true, :skip_blanks => true)
      partition.rows.each { |row| csv << row }
    ensure
      @file.close
    end
    nil
  rescue => ex
    handle_error!(ex)
  end

  private

  # Opens the output file for writing and emits the header lines.
  def prepare
    @file = File.open(file_name, 'w')
    @file.puts "-- Created at: #{Time.now.utc}"
    @file.puts "COPY #{table_name} (account_id,#{headers.join(',')}) FROM stdin WITH csv;"
  end

  # Removes the output file if it exists, then re-raises +ex+.
  def handle_error!(ex)
    File.delete(file_name) if File.file?(file_name)
    raise ex
  end
end
|
47
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
|
2
|
+
# Reopens Domainatrix::Url to add a query accessor and a registrable-domain
# helper.
class Domainatrix::Url
  attr_accessor :query

  # Joins domain and public suffix with a dot, leaving out whichever of the
  # two is nil (e.g. "example" + "co.uk" gives "example.co.uk").
  def toplevel
    present = [ domain, public_suffix ].reject { |piece| piece.nil? }
    present.join(".")
  end
end
|
9
|
+
|
10
|
+
# Domain parser that splits a URL's host into public suffix, domain and
# subdomain, falling back to the raw host when the split cannot be made.
class NFODomainParser < Domainatrix::DomainParser

  # Parses +url+ with URI and returns a Domainatrix::Url built from the
  # host breakdown plus the scheme, host, path, query and original url.
  def parse(url)
    uri = URI.parse(url)
    attributes = parse_domains_from_host(uri.host).merge({
      :scheme => uri.scheme,
      :host   => uri.host,
      :path   => uri.path,
      :query  => uri.query,
      :url    => url
    })
    Domainatrix::Url.new(attributes)
  end

  # TODO: This is a big monkey patch - we should be forking and fixing this
  #
  # Walks the host's labels from right to left through @public_suffixes
  # (presumably a nested Hash keyed by label - confirm against the parent
  # class) for as long as the next label is still part of a known suffix.
  # The first label past the suffix becomes the domain; everything further
  # left is the subdomain.
  #
  # Returns a Hash with :public_suffix, :domain and :subdomain.
  def parse_domains_from_host(host)
    labels = host.split(".").reverse
    suffix_labels = []
    domain = ""
    subdomain_labels = []
    node = @public_suffixes

    labels.each_with_index do |label, idx|
      node = node[label]
      # The suffix ends here when nothing is nested below this label, or
      # when the next label is not one of the nested entries.
      suffix_ends = node.empty? || !node.has_key?(labels[idx + 1])
      suffix_labels << label
      next unless suffix_ends

      domain = labels[idx + 1]
      subdomain_labels = labels.slice(idx + 2, labels.size)
      break
    end

    {
      :public_suffix => suffix_labels.reverse.join("."),
      :domain        => domain,
      :subdomain     => subdomain_labels.reverse.join(".")
    }
  rescue
    # Any error raised above lands here and the whole host is reported as
    # the domain.
    # Applies to IP Addresses here too
    {:public_suffix => nil, :domain => host, :subdomain => nil}
  end

end
|