nfcollector 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/.rspec +3 -0
  4. data/.rvmrc +1 -0
  5. data/Gemfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +1 -0
  9. data/lib/nfcollector.rb +41 -0
  10. data/lib/nfcollector/attribute_validator.rb +59 -0
  11. data/lib/nfcollector/attributes.rb +99 -0
  12. data/lib/nfcollector/categoriser.rb +43 -0
  13. data/lib/nfcollector/category_partition.rb +17 -0
  14. data/lib/nfcollector/configuration.rb +24 -0
  15. data/lib/nfcollector/copy_file_writer.rb +47 -0
  16. data/lib/nfcollector/domain_parser.rb +49 -0
  17. data/lib/nfcollector/input_definition.rb +31 -0
  18. data/lib/nfcollector/mapping.rb +7 -0
  19. data/lib/nfcollector/mapping/categories_processor.rb +36 -0
  20. data/lib/nfcollector/mapping/column_transpiler.rb +29 -0
  21. data/lib/nfcollector/mapping/default_output.rb +45 -0
  22. data/lib/nfcollector/mapping/effective_tld_names.dat +4394 -0
  23. data/lib/nfcollector/mapping/indexer.rb +21 -0
  24. data/lib/nfcollector/mapping/mapped_row.rb +21 -0
  25. data/lib/nfcollector/mapping/output.rb +59 -0
  26. data/lib/nfcollector/mapping/transpiler.rb +92 -0
  27. data/lib/nfcollector/nfcollector_exception.rb +4 -0
  28. data/lib/nfcollector/partition.rb +76 -0
  29. data/lib/nfcollector/partitioner.rb +37 -0
  30. data/lib/nfcollector/payload_processor.rb +46 -0
  31. data/lib/nfcollector/sequence_generator.rb +11 -0
  32. data/lib/nfcollector/version.rb +3 -0
  33. data/lib/nfcollector/weblog_partition.rb +26 -0
  34. data/nfcollector.gemspec +30 -0
  35. data/spec/attribute_validator_spec.rb +23 -0
  36. data/spec/attributes_spec.rb +15 -0
  37. data/spec/command_parser_spec.rb +81 -0
  38. data/spec/copy_file_writer_spec.rb +95 -0
  39. data/spec/input_definition_spec.rb +18 -0
  40. data/spec/nfcollector/category_partitioner_spec.rb +51 -0
  41. data/spec/nfcollector/date_partitioner_spec.rb +19 -0
  42. data/spec/nfcollector/input_definition_spec.rb +32 -0
  43. data/spec/nfcollector/mapping/column_transpiler_spec.rb +26 -0
  44. data/spec/nfcollector/mapping/output_spec.rb +76 -0
  45. data/spec/nfcollector/mapping/transpiler_spec.rb +47 -0
  46. data/spec/payload_job_spec.rb +11 -0
  47. data/spec/payload_processor_spec.rb +114 -0
  48. data/spec/spec_helper.rb +89 -0
  49. data/test/domains_hosts +194826 -0
  50. data/test/generate_input.rb +79 -0
  51. data/test/input/input-1000.csv +1000 -0
  52. data/test/input/input-100000.csv +100000 -0
  53. data/test/input/input-100000.dat +64039 -0
  54. data/test/input/input-no-tags.csv +3 -0
  55. data/test/input/input-no-tags.dat +3 -0
  56. data/test/input/input-no-tags.gz +0 -0
  57. data/test/input/input-with-tags.csv.gz +0 -0
  58. data/test/test_helper.rb +15 -0
  59. data/test/tester.rb +32 -0
  60. metadata +252 -0
@@ -0,0 +1,15 @@
require 'spec_helper'

# Specs for Nfcollector::Attributes.parse, which turns a comma-separated
# attribute string into an Nfcollector::InputDefinition.
describe Nfcollector::Attributes do
  describe '::parse' do
    subject(:definition) { described_class.parse(attributes_string) }

    let(:attributes_string) { '>a,Rh,Rp' }

    # The three tokens are expected to surface as :client_ip, :host and :path
    # at column indexes 0, 1 and 2 respectively.
    it do
      expected_indexes = { :client_ip => 0, :host => 1, :path => 2 }

      expect(definition).to be_a(Nfcollector::InputDefinition)
      expected_indexes.each do |column, index|
        expect(definition.column_index(column)).to eq(index)
      end
    end
  end
end
@@ -0,0 +1,81 @@
require 'spec_helper'

# Specs for Nfcollector::CommandParser.
#
# Instance examples build a parser from (account id, payload path, attribute
# string) and call #validate!; class examples go through .parse with a single
# colon-separated command string ("id:path:attributes").
describe Nfcollector::CommandParser do

  describe "Instance" do
    specify "raise if account_id is not an int" do
      expect {
        cp = described_class.new("abc", "/file/path", ">a")
        cp.validate!
      }.to raise_error(Nfcollector::InvalidCommand, "Account ID is not an Integer")
    end

    specify "raise if file not found" do
      allow(File).to receive(:file?).and_return(false)
      expect {
        cp = described_class.new(10, "/file/path", ">a")
        cp.validate!
      }.to raise_error(Nfcollector::InvalidCommand, "No such file for payload")
    end

    specify "raise if attributes are not valid" do
      allow(File).to receive(:file?).and_return(true)
      # The stub is keyed on the same attribute string the parser is built
      # with below. A stub constrained to a different argument would never
      # match the call made by the code under test.
      # NOTE(review): assumes the parser forwards its attribute string
      # unchanged to AttributeValidator.validate! -- confirm against the class.
      allow(
        Nfcollector::AttributeValidator
      ).to receive(:validate!).
        with('X').and_raise(Nfcollector::UnknownAttribute)

      expect {
        cp = described_class.new(10, "/file/path", 'X')
        cp.validate!
      }.to raise_error(Nfcollector::UnknownAttribute)
    end

    specify "raise if attributes are missing" do
      allow(File).to receive(:file?).and_return(true)
      expect {
        cp = described_class.new(10, "/file/path", nil)
        cp.validate!
      }.to raise_error(Nfcollector::InvalidCommand)
    end

    specify "raise if attributes are empty" do
      allow(File).to receive(:file?).and_return(true)
      expect {
        cp = described_class.new(10, "/file/path", "")
        cp.validate!
      }.to raise_error(Nfcollector::InvalidCommand)
    end

    specify "not raise if data is correct (string account id)" do
      allow(File).to receive(:file?).and_return(true)
      allow(Nfcollector::AttributeValidator).to receive(:validate!).and_return(true)
      # Explicit expectation, mirroring the numeric-account-id example below.
      expect {
        cp = described_class.new("100", "/file/path", Nfcollector::Attributes::REQUIRED.join(','))
        cp.validate!
      }.to_not raise_error
    end

    specify "not raise if data is correct (numeric account id)" do
      allow(File).to receive(:file?).and_return(true)
      allow(Nfcollector::AttributeValidator).to receive(:validate!).and_return(true)
      expect {
        cp = described_class.new(100, "/file/path", Nfcollector::Attributes::REQUIRED.join(','))
        cp.validate!
      }.to_not raise_error
    end
  end

  context "Class" do
    specify "raise if not all tokens are provided" do
      expect {
        described_class.parse("10:")
      }.to raise_error(Nfcollector::InvalidCommand)
    end

    specify "validate! provided tokens" do
      # `double` replaces the `stub(:name)` alias, which RSpec 3 removed.
      cp = double(:parser)
      # The bang method is the one the instance examples above exercise and
      # the one this example's title names.
      expect(cp).to receive(:validate!)
      expect(described_class).to receive(:new).with(10, '/foo/bar', '>a').and_return(cp)
      described_class.parse("10:/foo/bar:>a")
    end
  end
end
@@ -0,0 +1,95 @@
require 'spec_helper'

# Specs for Nfcollector::CopyFileWriter: collecting rows for a fixed column
# list and writing them out as a COPY ... FROM stdin WITH csv file.
describe Nfcollector::CopyFileWriter do
  describe "Adding rows" do
    # TODO: Test too few columns as well? Will this just result in NULLs?
    specify "ignore additional data if a row is added that has more columns set in the constructor" do
      expect {
        copy = described_class.new(%w(created_at username bytes), 10, 0)
        copy.add_row([Time.now, "daniel", 1000, "foo"])
      }.to_not raise_error
    end
  end

  describe "Table Name" do
    # be correct for the given date" do
    pending "Something needs to happen here but I don't know what"
  end

  describe "File Name" do
    specify "that it is correct" do
      Nfcollector::Configuration.output_dir = "/tmp/"
      copy = described_class.new(%w(created_at username bytes), 10, 0)
      expect(copy).to receive(:table_name).and_return("mocked_table_name")
      allow(copy).to receive(:randstr).and_return('randstr')
      expect(copy.file_name(Date.today)).to eq("/tmp/mocked_table_name_randstr.copy")
    end
  end

  describe "Writing" do
    before do
      # The examples below assert "/tmp/..." paths, so the output directory is
      # set here rather than relying on another example having configured it.
      Nfcollector::Configuration.output_dir = "/tmp/"
      allow_any_instance_of(described_class).to receive(:table_name).and_return('mocked_table_name')
      # Midnight at +10:00 is 14:00 UTC on the previous day, which is the
      # timestamp the expected file contents below carry.
      Timecop.freeze(Time.new(2011, 7, 11, 0, 0, 0, "+10:00"))
    end

    after do
      Timecop.return
    end

    specify "write the file" do
      copy = described_class.new(%w(created_at username bytes), 10, 0)
      copy.add_row([Time.now.utc, "daniel", 1000])
      io = StringIO.new
      expect(File).to receive(:open).and_yield(io)
      allow(copy).to receive(:randstr).and_return('randstr')
      expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy')
      copy.write
      # Heredoc body sits at column 0: `<<-` only relaxes the terminator's
      # indentation, not the content's.
      expect(io.string).to eq <<-COPY
-- Created at: 2011-07-10 14:00:00 UTC
COPY mocked_table_name (created_at,username,bytes) FROM stdin WITH csv;
"2011-07-10 14:00:00 UTC","daniel","1000"
      COPY
    end

    specify "write the file and handle NULLs" do
      copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
      copy.add_row([Time.now.utc, 'daniel', '1000', nil])
      io = StringIO.new
      expect(File).to receive(:open).and_yield(io)
      expect(copy).to receive(:randstr).and_return('randstr')
      expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy')
      copy.write
      expect(io.string).to eq <<-COPY
-- Created at: 2011-07-10 14:00:00 UTC
COPY mocked_table_name (created_at,username,bytes,user_group) FROM stdin WITH csv;
"2011-07-10 14:00:00 UTC","daniel","1000",""
      COPY
    end

    specify "delete the resultant file if there is an exception" do
      copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
      copy.add_row([Time.now, 'daniel', '1000', nil])
      expect(CSV).to receive(:new).and_raise(RuntimeError)
      allow(File).to receive(:file?).and_return(true)
      expect(File).to receive(:delete)
      expect { copy.write }.to raise_error(RuntimeError)
    end

    specify "raise if there are no rows" do
      copy = described_class.new(%w(created_at username bytes user_group), 10, 0)
      expect { copy.write }.to raise_error(Nfcollector::FileEmpty)
    end

    # TODO: This is not tested effectively
    specify "return false if a new copy file should be generated (date barrier)" do
      copy = described_class.new(%w(created_at username bytes), 10, 0)
      copy.add_row([Time.gm(2011, 7, 11, 0, 0, 0), 'daniel', '1000'])
      copy.add_row([Time.gm(2011, 7, 12, 0, 0, 0), 'daniel', '1000'])
      allow(copy).to receive(:randstr).and_return('randstr')
      expect(FileUtils).to receive(:mv).with('/tmp/mocked_table_name_randstr.copy.lock', '/tmp/mocked_table_name_randstr.copy').twice
      expect(File).to receive(:open).twice
      copy.write
    end
  end
end
@@ -0,0 +1,18 @@
require 'spec_helper'

# Specs for Nfcollector::InputDefinition#set and the column_index lookup it
# feeds.
describe Nfcollector::InputDefinition do
  describe '#set' do
    let(:definition) { described_class.new }

    before do
      definition.set(1, :username)
    end

    # A column registered through #set is reported at the index it was given.
    it do
      username_index = definition.column_index(:username)
      expect(username_index).to eq(1)
    end

    it 'that an error is raised if there is no index for the column' do
      missing = Nfcollector::InputDefinition::MissingDefinition
      expect { definition.column_index(:foo) }.to raise_error(missing)
    end
  end
end
@@ -0,0 +1,51 @@
require 'spec_helper'

# Specs for Nfcollector::CategoryPartitioner.
# NOTE(review): the constructor argument 2 looks like the index of the row
# column used as the partition id (the examples partition on the third
# value) -- confirm against the class.
describe Nfcollector::CategoryPartitioner do
  let(:partitioner) { described_class.new(2) }

  describe '#partition_id' do
    subject(:partition_id) { partitioner.partition_id(row) }

    context 'both columns available' do
      let(:row) { %w(1000 100 10) }

      it { expect(partition_id).to eq('10') }
    end
  end

  describe '#add_row' do
    subject { partitioner }

    let(:row) { %w(1000 100 10) }

    before { partitioner.add_row(row) }

    # Rows are grouped under their partition id in #data.
    it do
      expected = { '10' => [row] }
      expect(partitioner.data).to eq(expected)
    end

    describe 'adding a second row with the same partition id' do
      let(:row2) { %w(1001 100 10) }

      before { partitioner.add_row(row2) }

      it do
        expected = { '10' => [row, row2] }
        expect(partitioner.data).to eq(expected)
      end
    end

    describe 'adding a second row with the a different partition id' do
      let(:row3) { %w(1001 100 11) }

      before { partitioner.add_row(row3) }

      it do
        expected = { '10' => [row], '11' => [row3] }
        expect(partitioner.data).to eq(expected)
      end
    end

  end
end
@@ -0,0 +1,19 @@
require 'spec_helper'

# Specs for Nfcollector::DatePartitioner#partition_id. Both examples use a
# +1000 timestamp and check which calendar day the partition id renders as.
describe Nfcollector::DatePartitioner do
  let(:partitioner) { described_class.new(2) }

  describe '#partition_id' do
    subject(:partition_id) { partitioner.partition_id(row) }

    context 'the timestamp in UTC is on the same day' do
      let(:row) do
        timestamp = '2014-06-16 10:29:52 +1000'.to_time
        [ 1000, 100, timestamp ]
      end

      it { expect(partition_id.to_s).to eq('2014-06-16') }
    end

    context 'the timestamp in UTC is on the previous day' do
      let(:row) do
        timestamp = '2014-06-16 9:29:52 +1000'.to_time
        [ 1000, 100, timestamp ]
      end

      it { expect(partition_id.to_s).to eq('2014-06-15') }
    end
  end
end
+ end
@@ -0,0 +1,32 @@
require 'spec_helper'

# Specs for Nfcollector::InputDefinition: registering columns with #set and
# querying them with #column_index / #has_index_for?.
describe Nfcollector::InputDefinition do
  let(:input) { described_class.new }

  describe '#set' do
    before do
      input.set(0, :username)
    end

    it { expect(input.column_index(:username)).to eq(0) }
  end

  describe '#has_index_for?' do
    context 'no column present' do
      it { expect(input.has_index_for?(:username)).to be(false) }
    end

    context 'column is present' do
      before do
        input.set(0, :username)
      end

      it { expect(input.has_index_for?(:username)).to be(true) }
    end

    context 'mulitple columns present' do
      before do
        [:username, :client_ip].each_with_index do |column, index|
          input.set(index, column)
        end
      end

      # An array argument is accepted when every listed column is registered.
      it do
        both_present = input.has_index_for?([:username, :client_ip])
        expect(both_present).to be(true)
      end
    end
  end
end
@@ -0,0 +1,26 @@
require 'spec_helper'

# Specs for Nfcollector::Mapping::ColumnTranspiler#go: it must be built
# against an input definition before it can map a row.
describe Nfcollector::Mapping::ColumnTranspiler do
  subject(:column_transpiler) { described_class.new(output) }

  let(:output) { Nfcollector::Mapping::Output.new(:username) }
  let(:input_row) { [ '2013-10-10 00:30', 'daniel' ] }

  describe '#go' do
    context 'not yet built' do
      it do
        not_yet_built = Nfcollector::Mapping::ColumnTranspiler::NotYetBuilt
        expect { column_transpiler.go(input_row) }.to raise_error(not_yet_built)
      end
    end

    context 'has been built' do
      let(:input_definition) { Nfcollector::Attributes.parse('t,Un') }

      before do
        column_transpiler.build(input_definition)
      end

      # With the 't,Un' definition the :username output resolves to the
      # second value of the input row.
      it do
        mapped = column_transpiler.go(input_row)
        expect(mapped).to eq('daniel')
      end
    end
  end
end
@@ -0,0 +1,76 @@
require 'spec_helper'

# Specs for Nfcollector::Mapping::Output: the class-level `output` DSL and
# the defaults/options of individual output instances.
describe Nfcollector::Mapping::Output do
  describe 'class DSL' do
    describe 'base state' do
      # Subclass with no `output` declarations.
      before do
        class MyMapperA < Nfcollector::Mapping::Output
        end
      end

      it { expect(MyMapperA.outputs).to eq([]) }
    end

    describe 'with outputs defined' do
      # Subclass declaring two outputs through the DSL.
      before do
        class MyMapperB < Nfcollector::Mapping::Output
          output :created_at
          output :username
        end
      end

      it do
        declared = MyMapperB.outputs
        expect(declared.size).to eq(2)
        expect(declared.first).to be_a(Nfcollector::Mapping::Output)
      end
    end

    describe '#compile' do

    end
  end

  describe '#initialize' do
    context 'defaults' do
      subject(:output) { described_class.new(:username) }

      # Without options, the name doubles as the sole input and as the
      # processing method.
      it do
        expect(output.name).to eq(:username)
        expect(output.inputs).to eq([:username])
        expect(output.process_with).to eq(:username)
      end
    end

    context 'one using specific input' do
      subject(:output) { described_class.new(:username, inputs: :user) }

      it { expect(output.inputs).to eq([:user]) }
    end

    context 'several specific inputs' do
      subject(:output) { described_class.new(:username, inputs: [:user, :group]) }

      it { expect(output.inputs).to eq([:user, :group]) }
    end

    context 'with a custom processing method' do
      subject(:output) { described_class.new(:username, process_with: :do_stuff) }

      it { expect(output.process_with).to eq(:do_stuff) }
    end
  end

  describe '#optional' do
    context 'unset' do
      it do
        required_output = described_class.new(:username)
        expect(required_output).to_not be_optional
      end
    end

    context 'set' do
      it do
        optional_output = described_class.new(:username, optional: true)
        expect(optional_output).to be_optional
      end
    end
  end

  describe '#compile' do
    pending
  end

  describe '#method_missing' do
    pending
  end
end
@@ -0,0 +1,47 @@
require 'spec_helper'

# Specs for Nfcollector::Mapping::Transpiler#add_column: how an output is
# handled depending on whether the input definition has a column for it.
describe Nfcollector::Mapping::Transpiler do
  # TODO: Rename to add_output
  describe '#add_column' do
    subject(:add_column) { transpiler.add_column(output) }

    let(:input_definition) { Nfcollector::Attributes.parse('>a,Rh,Rp') }
    let(:transpiler) { described_class.new(input_definition) }

    context 'input definition has index for column' do
      let(:output) { Nfcollector::Mapping::Output.new(:client_ip) }

      it 'that we add an output' do
        expect { add_column }.to change { transpiler.outputs.size }.by(1)
      end

      it 'that the added output is valid column transpiler' do
        add_column
        first_output = transpiler.outputs[0]
        expect(first_output).to be_a(Nfcollector::Mapping::ColumnTranspiler)
      end
    end

    context 'input definition does not have index for column' do
      context 'and output is required' do
        let(:output) { Nfcollector::Mapping::Output.new(:username) }

        # A required output with no matching input column is rejected.
        it 'that we DO NOT add an output' do
          missing = Nfcollector::InputDefinition::MissingDefinition
          expect { add_column }.to raise_error(missing)
        end
      end

      context 'and output is not required' do
        let(:output) { Nfcollector::Mapping::Output.new(:username, optional: true) }

        # An optional output with no matching input column is skipped.
        it 'that we DO NOT add an output' do
          expect { add_column }.to_not change { transpiler.outputs.size }
        end
      end
    end
  end

  describe '#transpile' do
    pending
  end
end