nfcollector 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +1 -0
  9. data/lib/nfcollector.rb +41 -0
  10. data/lib/nfcollector/attribute_validator.rb +59 -0
  11. data/lib/nfcollector/attributes.rb +99 -0
  12. data/lib/nfcollector/categoriser.rb +43 -0
  13. data/lib/nfcollector/category_partition.rb +17 -0
  14. data/lib/nfcollector/configuration.rb +24 -0
  15. data/lib/nfcollector/copy_file_writer.rb +47 -0
  16. data/lib/nfcollector/domain_parser.rb +49 -0
  17. data/lib/nfcollector/input_definition.rb +31 -0
  18. data/lib/nfcollector/mapping.rb +7 -0
  19. data/lib/nfcollector/mapping/categories_processor.rb +36 -0
  20. data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
  21. data/lib/nfcollector/mapping/default_output.rb +45 -0
  22. data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
  23. data/lib/nfcollector/mapping/indexer.rb +21 -0
  24. data/lib/nfcollector/mapping/mapped_row.rb +21 -0
  25. data/lib/nfcollector/mapping/output.rb +59 -0
  26. data/lib/nfcollector/mapping/transpiler.rb +92 -0
  27. data/lib/nfcollector/nfcollector_exception.rb +4 -0
  28. data/lib/nfcollector/partition.rb +76 -0
  29. data/lib/nfcollector/partitioner.rb +37 -0
  30. data/lib/nfcollector/payload_processor.rb +46 -0
  31. data/lib/nfcollector/sequence_generator.rb +11 -0
  32. data/lib/nfcollector/version.rb +3 -0
  33. data/lib/nfcollector/weblog_partition.rb +26 -0
  34. data/nfcollector.gemspec +30 -0
  35. data/spec/attribute_validator_spec.rb +23 -0
  36. data/spec/attributes_spec.rb +15 -0
  37. data/spec/command_parser_spec.rb +81 -0
  38. data/spec/copy_file_writer_spec.rb +95 -0
  39. data/spec/input_definition_spec.rb +18 -0
  40. data/spec/nfcollector/category_partitioner_spec.rb +51 -0
  41. data/spec/nfcollector/date_partitioner_spec.rb +19 -0
  42. data/spec/nfcollector/input_definition_spec.rb +32 -0
  43. data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
  44. data/spec/nfcollector/mapping/output_spec.rb +76 -0
  45. data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
  46. data/spec/payload_job_spec.rb +11 -0
  47. data/spec/payload_processor_spec.rb +114 -0
  48. data/spec/spec_helper.rb +89 -0
  49. data/test/domains_hosts +194826 -0
  50. data/test/generate_input.rb +79 -0
  51. data/test/input/input-1000.csv +1000 -0
  52. data/test/input/input-100000.csv +100000 -0
  53. data/test/input/input-100000.dat +64039 -0
  54. data/test/input/input-no-tags.csv +3 -0
  55. data/test/input/input-no-tags.dat +3 -0
  56. data/test/input/input-no-tags.gz +0 -0
  57. data/test/input/input-with-tags.csv.gz +0 -0
  58. data/test/test_helper.rb +15 -0
  59. data/test/tester.rb +32 -0
  60. metadata +252 -0
@@ -0,0 +1,15 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::Attributes do
4
+ describe '::parse' do
5
+ let(:attributes_string) { '>a,Rh,Rp' }
6
+ subject { described_class.parse(attributes_string) }
7
+
8
+ specify do
9
+ expect(subject).to be_a(Nfcollector::InputDefinition)
10
+ expect(subject.column_index(:client_ip)).to eq(0)
11
+ expect(subject.column_index(:host)).to eq(1)
12
+ expect(subject.column_index(:path)).to eq(2)
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,81 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::CommandParser do
4
+
5
+ describe "Instance" do
6
+ specify "raise if account_id is not an int" do
7
+ expect {
8
+ cp = described_class.new("abc", "/file/path", ">a")
9
+ cp.validate!
10
+ }.to raise_error(Nfcollector::InvalidCommand, "Account ID is not an Integer")
11
+ end
12
+
13
+ specify "raise if file not found" do
14
+ allow(File).to receive(:file?).and_return(false)
15
+ expect {
16
+ cp = described_class.new(10, "/file/path", ">a")
17
+ cp.validate!
18
+ }.to raise_error(Nfcollector::InvalidCommand, "No such file for payload")
19
+ end
20
+
21
+ specify "raise if attributes are not valid" do
22
+ allow(File).to receive(:file?).and_return(true)
23
+ allow(
24
+ Nfcollector::AttributeValidator
25
+ ).to receive(:validate!).
26
+ with(">a").and_raise(Nfcollector::UnknownAttribute)
27
+
28
+ expect {
29
+ cp = described_class.new(10, "/file/path", 'X')
30
+ cp.validate!
31
+ }.to raise_error(Nfcollector::UnknownAttribute)
32
+ end
33
+
34
+ specify "raise if attributes are missing" do
35
+ allow(File).to receive(:file?).and_return(true)
36
+ expect {
37
+ cp = described_class.new(10, "/file/path", nil)
38
+ cp.validate!
39
+ }.to raise_error(Nfcollector::InvalidCommand)
40
+ end
41
+
42
+ specify "raise if attributes are empty" do
43
+ allow(File).to receive(:file?).and_return(true)
44
+ expect {
45
+ cp = described_class.new(10, "/file/path", "")
46
+ cp.validate!
47
+ }.to raise_error(Nfcollector::InvalidCommand)
48
+ end
49
+
50
+ specify "not raise if data is correct (string account id)" do
51
+ allow(File).to receive(:file?).and_return(true)
52
+ allow(Nfcollector::AttributeValidator).to receive(:validate!).and_return(true)
53
+ cp = described_class.new("100", "/file/path", Nfcollector::Attributes::REQUIRED.join(','))
54
+ cp.validate!
55
+ end
56
+
57
+ specify "not raise if data is correct (numeric account id)" do
58
+ allow(File).to receive(:file?).and_return(true)
59
+ allow(Nfcollector::AttributeValidator).to receive(:validate!).and_return(true)
60
+ expect {
61
+ cp = described_class.new(100, "/file/path", Nfcollector::Attributes::REQUIRED.join(','))
62
+ cp.validate!
63
+ }.to_not raise_error
64
+ end
65
+ end
66
+
67
+ context "Class" do
68
+ specify "raise if not all tokens are provided" do
69
+ expect {
70
+ described_class.parse("10:")
71
+ }.to raise_error(Nfcollector::InvalidCommand)
72
+ end
73
+
74
+ specify "validate! provided tokens" do
75
+ cp = stub(:parser)
76
+ expect(cp).to receive(:validate)
77
+ expect(described_class).to receive(:new).with(10, '/foo/bar', '>a').and_return(cp)
78
+ described_class.parse("10:/foo/bar:>a")
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,95 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::CopyFileWriter do
4
+ describe "Adding rows" do
5
+ # TODO: Test too few columns as well? Will this just result in NULLs?
6
+ specify "ignore additional data if a row is added that has more columns set in the constructor" do
7
+ expect {
8
+ copy = described_class.new(%w(created_at username bytes), 10, 0)
9
+ copy.add_row([Time.now, "daniel", 1000, "foo"])
10
+ }.to_not raise_error
11
+ end
12
+ end
13
+
14
+ describe "Table Name" do
15
+ # be correct for the given date" do
16
+ pending "Something needs to happen here but I don't know what"
17
+ end
18
+
19
+ describe "File Name" do
20
+ specify "that it is correct" do
21
+ Nfcollector::Configuration.output_dir = "/tmp/"
22
+ date = stub(:date)
23
+ copy = described_class.new(%w(created_at username bytes), 10, 0)
24
+ expect(copy).to receive(:table_name).and_return("mocked_table_name")
25
+ allow(copy).to receive(:randstr).and_return('randstr')
26
+ expect(copy.file_name(Date.today)).to eq("/tmp/mocked_table_name_randstr.copy")
27
+ end
28
+ end
29
+
30
+ describe "Writing" do
31
+ before do
32
+ allow_any_instance_of(described_class).to receive(:table_name).and_return('mocked_table_name')
33
+ Timecop.freeze(Time.new(2011, 7, 11, 0, 0, 0, "+10:00"))
34
+ end
35
+
36
+ after do
37
+ Timecop.return
38
+ end
39
+
40
+ specify "write the file" do
41
+ copy = described_class.new(%w(created_at username bytes), 10, 0)
42
+ copy.add_row([Time.now.utc, "daniel", 1000])
43
+ io = StringIO.new
44
+ expect(File).to receive(:open).and_yield(io)
45
+ allow(copy).to receive(:randstr).and_return('randstr')
46
+ expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy')
47
+ copy.write
48
+ expect(io.string).to eq <<-COPY
49
+ -- Created at: 2011-07-10 14:00:00 UTC
50
+ COPY mocked_table_name (created_at,username,bytes) FROM stdin WITH csv;
51
+ "2011-07-10 14:00:00 UTC","daniel","1000"
52
+ COPY
53
+ end
54
+
55
+ specify "write the file and handle NULLs" do
56
+ copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
57
+ copy.add_row([Time.now.utc, 'daniel', '1000', nil])
58
+ io = StringIO.new
59
+ expect(File).to receive(:open).and_yield(io)
60
+ expect(copy).to receive(:randstr).and_return('randstr')
61
+ expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy')
62
+ copy.write
63
+ expect(io.string).to eq <<-COPY
64
+ -- Created at: 2011-07-10 14:00:00 UTC
65
+ COPY mocked_table_name (created_at,username,bytes,user_group) FROM stdin WITH csv;
66
+ "2011-07-10 14:00:00 UTC","daniel","1000",""
67
+ COPY
68
+ end
69
+
70
+ specify "delete the resultant file if there is an exception" do
71
+ copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
72
+ copy.add_row([Time.now, 'daniel', '1000', nil])
73
+ expect(CSV).to receive(:new).and_raise(RuntimeError)
74
+ allow(File).to receive(:file?).and_return(true)
75
+ expect(File).to receive(:delete)
76
+ expect { copy.write }.to raise_error(RuntimeError)
77
+ end
78
+
79
+ specify "raise if there are no rows" do
80
+ copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
81
+ expect { copy.write }.to raise_error(Nfcollector::FileEmpty)
82
+ end
83
+
84
+ # TODO: This is not tested effectively
85
+ specify "return false if a new copy file should be generated (date barrier)" do
86
+ copy = described_class.new(%w(created_at username bytes), 10, 0)
87
+ copy.add_row([Time.gm(2011, 7, 11, 0, 0, 0), 'daniel', '1000'])
88
+ copy.add_row([Time.gm(2011, 7, 12, 0, 0, 0), 'daniel', '1000'])
89
+ allow(copy).to receive(:randstr).and_return('randstr')
90
+ expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy').twice
91
+ expect(File).to receive(:open).twice
92
+ copy.write
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::InputDefinition do
4
+ describe '#set' do
5
+ let(:definition) { described_class.new }
6
+ before { definition.set(1, :username) }
7
+
8
+ specify do
9
+ expect(definition.column_index(:username)).to eq(1)
10
+ end
11
+
12
+ specify 'that an error is raised if there is no index for the column' do
13
+ expect {
14
+ definition.column_index(:foo)
15
+ }.to raise_error(Nfcollector::InputDefinition::MissingDefinition)
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::CategoryPartitioner do
4
+ let(:partitioner) { described_class.new(2) }
5
+
6
+ describe '#partition_id' do
7
+ subject { partitioner.partition_id(row) }
8
+
9
+ context 'both columns available' do
10
+ let(:row) { %w(1000 100 10) }
11
+ specify { expect(subject).to eq('10') }
12
+ end
13
+ end
14
+
15
+ describe '#add_row' do
16
+ let(:row) { %w(1000 100 10) }
17
+ subject { partitioner }
18
+
19
+ before { subject.add_row(row) }
20
+
21
+ specify do
22
+ expect(subject.data).to eq({
23
+ '10' => [row]
24
+ })
25
+ end
26
+
27
+ describe 'adding a second row with the same partition id' do
28
+ let(:row2) { %w(1001 100 10) }
29
+ before { subject.add_row(row2) }
30
+
31
+ specify do
32
+ expect(subject.data).to eq({
33
+ '10' => [row, row2]
34
+ })
35
+ end
36
+ end
37
+
38
+ describe 'adding a second row with the a different partition id' do
39
+ let(:row3) { %w(1001 100 11) }
40
+ before { subject.add_row(row3) }
41
+
42
+ specify do
43
+ expect(subject.data).to eq({
44
+ '10' => [row],
45
+ '11' => [row3]
46
+ })
47
+ end
48
+ end
49
+
50
+ end
51
+ end
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::DatePartitioner do
4
+ let(:partitioner) { described_class.new(2) }
5
+
6
+ describe '#partition_id' do
7
+ subject { partitioner.partition_id(row) }
8
+
9
+ context 'the timestamp in UTC is on the same day' do
10
+ let(:row) { [ 1000, 100, '2014-06-16 10:29:52 +1000'.to_time ] }
11
+ specify { expect(subject.to_s).to eq('2014-06-16') }
12
+ end
13
+
14
+ context 'the timestamp in UTC is on the previous day' do
15
+ let(:row) { [ 1000, 100, '2014-06-16 9:29:52 +1000'.to_time ] }
16
+ specify { expect(subject.to_s).to eq('2014-06-15') }
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,32 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::InputDefinition do
4
+ let(:input) { described_class.new }
5
+
6
+ describe '#set' do
7
+ before { input.set(0, :username) }
8
+ specify { expect(input.column_index(:username)).to eq(0) }
9
+ end
10
+
11
+ describe '#has_index_for?' do
12
+ context 'no column present' do
13
+ specify { expect(input.has_index_for?(:username)).to be(false) }
14
+ end
15
+
16
+ context 'column is present' do
17
+ before { input.set(0, :username) }
18
+ specify { expect(input.has_index_for?(:username)).to be(true) }
19
+ end
20
+
21
+ context 'mulitple columns present' do
22
+ before do
23
+ input.set(0, :username)
24
+ input.set(1, :client_ip)
25
+ end
26
+
27
+ specify do
28
+ expect(input.has_index_for?([:username, :client_ip])).to be(true)
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,26 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::Mapping::ColumnTranspiler do
4
+ let(:output) { Nfcollector::Mapping::Output.new(:username) }
5
+ let(:input_row) { [ '2013-10-10 00:30', 'daniel' ] }
6
+ subject(:column_transpiler) { described_class.new(output) }
7
+
8
+ describe '#go' do
9
+ context 'not yet built' do
10
+ specify do
11
+ expect {
12
+ subject.go(input_row)
13
+ }.to raise_error(Nfcollector::Mapping::ColumnTranspiler::NotYetBuilt)
14
+ end
15
+ end
16
+
17
+ context 'has been built' do
18
+ let(:input_definition) { Nfcollector::Attributes.parse('t,Un') }
19
+ before { subject.build(input_definition) }
20
+
21
+ specify do
22
+ expect(subject.go(input_row)).to eq('daniel')
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,76 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::Mapping::Output do
4
+ describe 'class DSL' do
5
+ describe 'base state' do
6
+ before do
7
+ class MyMapperA < Nfcollector::Mapping::Output; end
8
+ end
9
+
10
+ specify { expect(MyMapperA.outputs).to eq([]) }
11
+ end
12
+
13
+ describe 'with outputs defined' do
14
+ before do
15
+ class MyMapperB < Nfcollector::Mapping::Output
16
+ output :created_at
17
+ output :username
18
+ end
19
+ end
20
+
21
+ specify do
22
+ expect(MyMapperB.outputs.size).to eq(2)
23
+ expect(MyMapperB.outputs.first).to be_a(Nfcollector::Mapping::Output)
24
+ end
25
+ end
26
+
27
+ describe '#compile' do
28
+
29
+ end
30
+ end
31
+
32
+ describe '#initialize' do
33
+ context 'defaults' do
34
+ subject { described_class.new(:username) }
35
+
36
+ specify do
37
+ expect(subject.name).to eq(:username)
38
+ expect(subject.inputs).to eq([:username])
39
+ expect(subject.process_with).to eq(:username)
40
+ end
41
+ end
42
+
43
+ context 'one using specific input' do
44
+ subject { described_class.new(:username, inputs: :user) }
45
+ specify { expect(subject.inputs).to eq([:user]) }
46
+ end
47
+
48
+ context 'several specific inputs' do
49
+ subject { described_class.new(:username, inputs: [:user, :group]) }
50
+ specify { expect(subject.inputs).to eq([:user, :group]) }
51
+ end
52
+
53
+ context 'with a custom processing method' do
54
+ subject { described_class.new(:username, process_with: :do_stuff) }
55
+ specify { expect(subject.process_with).to eq(:do_stuff) }
56
+ end
57
+ end
58
+
59
+ describe '#optional' do
60
+ context 'unset' do
61
+ specify { expect(described_class.new(:username)).to_not be_optional }
62
+ end
63
+
64
+ context 'set' do
65
+ specify { expect(described_class.new(:username, optional: true)).to be_optional }
66
+ end
67
+ end
68
+
69
+ describe '#compile' do
70
+ pending
71
+ end
72
+
73
+ describe '#method_missing' do
74
+ pending
75
+ end
76
+ end
@@ -0,0 +1,47 @@
1
+ require 'spec_helper'
2
+
3
+ describe Nfcollector::Mapping::Transpiler do
4
+ # TODO: Rename to add_output
5
+ describe '#add_column' do
6
+ let(:transpiler) { described_class.new(input_definition) }
7
+ let(:input_definition) { Nfcollector::Attributes.parse('>a,Rh,Rp') }
8
+ subject { transpiler.add_column(output) }
9
+
10
+ context 'input definition has index for column' do
11
+ let(:output) { Nfcollector::Mapping::Output.new(:client_ip) }
12
+
13
+ specify 'that we add an output' do
14
+ expect { subject }.to change { transpiler.outputs.size }.by(1)
15
+ end
16
+
17
+ specify 'that the added output is valid column transpiler' do
18
+ subject
19
+ expect(transpiler.outputs[0]).to be_a(Nfcollector::Mapping::ColumnTranspiler)
20
+ end
21
+ end
22
+
23
+ context 'input definition does not have index for column' do
24
+ context 'and output is required' do
25
+ let(:output) { Nfcollector::Mapping::Output.new(:username) }
26
+
27
+ specify 'that we DO NOT add an output' do
28
+ expect {
29
+ subject
30
+ }.to raise_error(Nfcollector::InputDefinition::MissingDefinition)
31
+ end
32
+ end
33
+
34
+ context 'and output is not required' do
35
+ let(:output) { Nfcollector::Mapping::Output.new(:username, optional: true) }
36
+
37
+ specify 'that we DO NOT add an output' do
38
+ expect { subject }.to_not change { transpiler.outputs.size }
39
+ end
40
+ end
41
+ end
42
+ end
43
+
44
+ describe '#transpile' do
45
+ pending
46
+ end
47
+ end