rflow-components-file 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in rflow-components-file.gemspec
4
+ gemspec
data/Rakefile ADDED
@@ -0,0 +1,16 @@
1
# Build, test and documentation tasks for the rflow-components-file gem.
require 'bundler'
require 'rspec/core/rake_task'

# `rake/rdoctask` is the legacy location of the RDoc rake task; RDoc itself
# ships the replacement as `rdoc/task`. Prefer the RDoc-provided task and fall
# back to the legacy one so the Rakefile loads under either setup.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

# Register Bundler's gem helper tasks.
Bundler::GemHelper.install_tasks

# `rake spec` runs the RSpec suite with colored, verbose output.
RSpec::Core::RakeTask.new(:spec) do |t|
  t.verbose = true
  t.rspec_opts = '--tty --color'
end

# `rake rdoc` renders the README and library sources into doc/html.
rdoc_task_class.new do |rd|
  rd.main = "README"
  rd.rdoc_files.include("README", "lib/**/*.rb")
  rd.rdoc_dir = File.join('doc', 'html')
end
16
+
@@ -0,0 +1,95 @@
1
+ require 'rflow/component'
2
+
3
+ require 'digest/md5'
4
+
5
+ class RFlow
6
+ module Components
7
+ module File
8
+ class DirectoryWatcher < RFlow::Component
9
+ output_port :file_port
10
+ output_port :raw_port
11
+
12
+ DEFAULT_CONFIG = {
13
+ 'directory_path' => '/tmp/import',
14
+ 'file_name_glob' => '*',
15
+ 'poll_interval' => 1,
16
+ 'files_per_poll' => 1,
17
+ 'remove_files' => true,
18
+ }
19
+
20
+ attr_accessor :config, :poll_interval, :directory_path, :file_name_glob, :remove_files
21
+
22
# Applies +config+ on top of DEFAULT_CONFIG, caches the parsed settings in
# instance variables and checks that the watched directory can be used.
#
# config - Hash with String keys overriding entries of DEFAULT_CONFIG.
#
# Raises ArgumentError if the directory does not exist or is not readable,
# or if 'remove_files' cannot be coerced to a boolean.
def configure!(config)
  @config = DEFAULT_CONFIG.merge(config)
  settings = @config

  @directory_path = ::File.expand_path(settings['directory_path'])
  @file_name_glob = settings['file_name_glob']
  @poll_interval  = settings['poll_interval'].to_i
  @files_per_poll = settings['files_per_poll'].to_i
  @remove_files   = to_boolean(settings['remove_files'])

  raise ArgumentError, "Invalid directory '#{@directory_path}'" unless ::File.directory?(@directory_path)
  raise ArgumentError, "Unable to read from directory '#{@directory_path}'" unless ::File.readable?(@directory_path)

  # TODO: more error checking of input config
end
40
+
41
+
42
+ # TODO: optimize sending of messages based on what is connected
43
# Starts a periodic EventMachine timer that polls the watched directory.
#
# On every tick, up to @files_per_poll regular files matching the glob are
# read (earliest modification time first). Each file is sent as a
# 'RFlow::Message::Data::File' message on file_port and as a
# 'RFlow::Message::Data::Raw' message on raw_port, then deleted when
# @remove_files is set.
#
# Returns the EventMachine::PeriodicTimer.
#
# TODO: optimize sending of messages based on what is connected
def run!
  EventMachine::PeriodicTimer.new(poll_interval) do
    watch_pattern = ::File.join(@directory_path, @file_name_glob)
    RFlow.logger.debug "Polling for files in #{watch_pattern}"

    # A glob such as '*' also matches sub-directories, which cannot be read
    # as files; keep only regular files so one stray directory does not make
    # every poll raise.
    candidate_paths = Dir.glob(watch_pattern).select { |path| ::File.file?(path) }

    # Sort by last modified, which will process the earliest
    # modified file first
    file_paths = candidate_paths.sort_by { |path| ::File.mtime(path) }

    file_paths.first(@files_per_poll).each do |file_path|
      RFlow.logger.debug "Importing #{file_path}"
      ::File.open(file_path, 'r:BINARY') do |file|
        file_content = file.read

        RFlow.logger.debug "read #{file_content.bytesize} bytes of #{file.size} in #{file.path}, md5 #{Digest::MD5.hexdigest(file_content)}"

        file_message = RFlow::Message.new('RFlow::Message::Data::File')

        file_message.data.path = ::File.expand_path(file.path)
        file_message.data.size = file.size
        file_message.data.content = file_content
        # NOTE(review): File#ctime is the inode change time on most Unix
        # systems rather than a true creation time -- confirm this is the
        # intended value for creation_timestamp.
        file_message.data.creation_timestamp = file.ctime
        file_message.data.modification_timestamp = file.mtime
        file_message.data.access_timestamp = file.atime

        file_port.send_message file_message

        raw_message = RFlow::Message.new('RFlow::Message::Data::Raw')
        raw_message.data.raw = file_content
        raw_port.send_message raw_message
      end

      if @remove_files
        # file_path comes from Dir.glob on the joined pattern, so it already
        # contains the directory; joining it again logged a bogus path.
        RFlow.logger.debug "Removing #{file_path}"
        ::File.delete file_path
      end
    end
  end
end
80
+
81
# Coerces a configuration value to true or false.
#
# string - the value to coerce. Accepted truthy forms are true, 1, '1' and
#          'true' (any case); accepted falsy forms are false, 0, '0' and
#          'false' (any case).
#
# Returns true or false.
# Raises ArgumentError for any other value.
def to_boolean(string)
  case string
  # \A and \z anchor the whole string; ^ and $ only anchor a line, and the
  # previous false pattern had no end anchor at all (so 'falsey' was accepted).
  when true, 1, '1', /\Atrue\z/i
    true
  when false, 0, '0', /\Afalse\z/i
    false
  else
    raise ArgumentError, "'#{string}' cannot be coerced to a boolean value"
  end
end
91
+
92
+ end
93
+ end
94
+ end
95
+ end
@@ -0,0 +1,58 @@
1
# Time.xmlschema and Time#xmlschema are provided by the stdlib 'time'
# library, not by core Time, so load it here rather than relying on a caller.
require 'time'

class RFlow
  module Components
    module File

      # The set of extensions to add capability to File data types
      module Extensions

        # Accessors for the File data type, backed by the hash held in
        # +data_object+ on the extended object.
        #
        # Need to be careful when extending to not clobber data already in data_object
        module FileExtension
          # Installs default field values, but only when the extended object
          # has no data_object yet.
          def self.extended(base_data)
            base_data.data_object ||= {
              'path' => '/', 'size' => 0, 'content' => '',
              'creation_timestamp' => nil, 'modification_timestamp' => nil, 'access_timestamp' => nil
            }
          end

          # Default/string accessors
          %w[path content].each do |name|
            define_method(name) { |*_args| data_object[name] }
            define_method(:"#{name}=") { |*args| data_object[name] = args.first }
          end

          # Integer accessors: the writer coerces its argument with to_i.
          %w[size].each do |name|
            define_method(name) { |*_args| data_object[name] }
            define_method(:"#{name}=") { |*args| data_object[name] = args.first.to_i }
          end

          # Timestamp accessors. Values are stored as XML Schema strings and
          # read back as Time (nil stays nil). Note, the precision of the
          # stored timestamp is set to 9 digits, meaning that the time you
          # put in might be slightly different from the time you read out.
          %w[creation_timestamp modification_timestamp access_timestamp].each do |name|
            define_method(name) do |*_args|
              stored = data_object[name]
              stored ? Time.xmlschema(stored) : nil
            end

            define_method(:"#{name}=") do |*args|
              value = args.first
              # Time objects are serialized; anything else (an xmlschema
              # string or nil) is stored untouched.
              data_object[name] = value.is_a?(Time) ? value.xmlschema(9) : value
            end
          end

        end
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ require 'eventmachine'
2
+ require 'rflow/component'
3
+
4
+ require 'digest/md5'
5
+
6
+ class RFlow
7
+ module Components
8
+ module File
9
+ class OutputRawToFiles < RFlow::Component
10
+ input_port :raw_port
11
+
12
+ DEFAULT_CONFIG = {
13
+ 'directory_path' => '/tmp',
14
+ 'file_name_prefix' => 'output.',
15
+ 'file_name_suffix' => '.out',
16
+ }
17
+
18
+ attr_accessor :config, :directory_path, :file_name_prefix, :file_name_suffix
19
+
20
+
21
# Applies +config+ on top of DEFAULT_CONFIG, caches the output settings in
# instance variables and checks that the output directory can be used.
#
# config - Hash with String keys overriding entries of DEFAULT_CONFIG.
#
# Raises ArgumentError if the directory does not exist or is not writable.
def configure!(config)
  @config = DEFAULT_CONFIG.merge config
  @directory_path = ::File.expand_path(@config['directory_path'])
  @file_name_prefix = @config['file_name_prefix']
  @file_name_suffix = @config['file_name_suffix']

  unless ::File.directory?(@directory_path)
    raise ArgumentError, "Invalid directory '#{@directory_path}'"
  end

  # This component writes files, so the failure is about writing; the
  # message previously claimed a read problem.
  unless ::File.writable?(@directory_path)
    raise ArgumentError, "Unable to write to directory '#{@directory_path}'"
  end

  # TODO: more error checking of input config

  # Counter used to disambiguate generated output file names.
  @output_file_entropy = 0
end
39
+
40
+
41
# Writes the payload of a Raw message to a uniquely named file in
# directory_path.
#
# The data is first written to a hidden temp file (the final name prefixed
# with '.') that is created exclusively, then renamed to the final name. If
# the temp name is already taken, a new name is generated and the write is
# retried.
#
# input_port, input_port_key, connection - not used by this component.
# message - the incoming message; ignored unless its data type is
#           'RFlow::Message::Data::Raw'.
#
# Returns the path of the written file, or nil when the message is ignored.
def process_message(input_port, input_port_key, connection, message)
  return unless message.data_type_name == 'RFlow::Message::Data::Raw'

  @output_file_entropy = 0
  begin
    # Tracks whether this attempt created the temp file, so cleanup never
    # removes a file that belongs to someone else.
    temp_file_created = false
    final_output_file_name = output_file_name

    temp_output_file_path = ::File.join(directory_path, ".#{final_output_file_name}")
    final_output_file_path = ::File.join(directory_path, "#{final_output_file_name}")

    RFlow.logger.debug "#{self.class.name}##{__method__}: Outputting raw message to #{final_output_file_path} (via #{temp_output_file_path}) with #{message.data.raw.bytesize} bytes and md5 #{Digest::MD5.hexdigest message.data.raw}"

    # BINARY keeps the raw payload byte-for-byte on platforms that would
    # otherwise translate line endings.
    open_flags = ::File::CREAT | ::File::EXCL | ::File::RDWR | ::File::BINARY
    ::File.open(temp_output_file_path, open_flags, 0644) do |file|
      temp_file_created = true
      file.flock(::File::LOCK_EX)
      file.write(message.data.raw)
    end
    ::File.rename(temp_output_file_path, final_output_file_path)
    RFlow.logger.debug "#{self.class.name}##{__method__}: Succesfully output raw message to #{final_output_file_path}"
  rescue Errno::EEXIST
    RFlow.logger.debug("#{self.class.name}##{__method__}: File #{temp_output_file_path} exists, increasing entropy")
    # Normally EEXIST comes from the exclusive open and there is nothing to
    # clean up; if it came from a later step, drop our own temp file first.
    ::File.delete(temp_output_file_path) if temp_file_created && ::File.exist?(temp_output_file_path)
    retry
  rescue StandardError
    # Do not leave a partially written hidden file behind on failure.
    ::File.delete(temp_output_file_path) if temp_file_created && ::File.exist?(temp_output_file_path)
    raise
  end

  final_output_file_path
end
66
+
67
+
68
+ private
69
+
70
+
71
# Builds the next output file name as
# <prefix><UTC timestamp>-<zero-padded counter><suffix>.
# Calling it advances the entropy counter.
def output_file_name
  format('%s%s-%s%s', file_name_prefix, current_timestamp, output_file_entropy, file_name_suffix)
end
74
+
75
+
76
# Advances the per-component counter and returns it as a decimal string,
# zero-padded to at least four digits.
def output_file_entropy
  next_value = (@output_file_entropy += 1)
  '%04d' % next_value
end
80
+
81
+
82
# Returns the current UTC time as 'YYYYMMDD_HHMMSS.uuuuuu', where the
# fractional part is the microsecond count padded to six digits.
def current_timestamp
  now = Time.now.utc
  format('%s.%06d', now.strftime('%Y%m%d_%H%M%S'), now.usec)
end
86
+
87
+ end
88
+ end
89
+ end
90
+ end
@@ -0,0 +1,7 @@
1
+ class RFlow
2
+ module Components
3
+ module File
4
+ VERSION = "0.0.5"
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,26 @@
1
+ require 'rflow/components/file/extensions'
2
+ require 'rflow/components/file/directory_watcher'
3
+ require 'rflow/components/file/output_raw_to_files'
4
+
5
class RFlow
  module Components
    module File
      # Directory holding the Avro schema files, resolved relative to this
      # source file.
      SCHEMA_DIRECTORY = ::File.expand_path(::File.join('..', '..', '..', 'schema'), ::File.dirname(__FILE__))

      # Schema file name => RFlow data type name it defines.
      SCHEMA_FILES = {
        'file.avsc' => 'RFlow::Message::Data::File',
      }

      # Load the schemas: register every schema file as an 'avro' data type.
      SCHEMA_FILES.each_pair do |schema_file_name, data_type_name|
        schema_path = ::File.join(SCHEMA_DIRECTORY, schema_file_name)
        RFlow::Configuration.add_available_data_type(data_type_name, 'avro', ::File.read(schema_path))
      end

      # Load the data extensions
      RFlow::Configuration.add_available_data_extension(
        'RFlow::Message::Data::File',
        RFlow::Components::File::Extensions::FileExtension
      )

    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'rflow'
2
+ require 'rflow/components/file'
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "rflow/components/file/version"
4
+
5
+ Gem::Specification.new do |s|
6
+ s.name = "rflow-components-file"
7
+ s.version = RFlow::Components::File::VERSION
8
+ s.platform = Gem::Platform::RUBY
9
+ s.required_ruby_version = '~> 1.9'
10
+ s.authors = ["Michael L. Artz"]
11
+ s.email = ["michael.artz@redjack.com"]
12
+ s.homepage = ""
13
+ s.summary = %q{Components that operate on files for the RFlow FBP framework}
14
+ s.description = %q{Components that operate on files for the RFlow FBP framework. Also includes the File schema}
15
+
16
+ s.rubyforge_project = "rflow-components-file"
17
+
18
+ s.files = `git ls-files`.split("\n")
19
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
20
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
21
+ s.require_paths = ["lib"]
22
+
23
+ s.add_dependency 'rflow', '~> 0.0'
24
+ s.add_dependency 'eventmachine_httpserver', '~> 0.2'
25
+
26
+ s.add_development_dependency 'rspec', '~> 2.6'
27
+ s.add_development_dependency 'rake', '~> 0.8'
28
+ end
data/schema/file.avsc ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "type": "record",
3
+ "name": "File",
4
+ "namespace": "org.rflow.message.data",
5
+ "aliases": [],
6
+ "fields": [
7
+ {"name": "path", "type": ["string", "null"]},
8
+ {"name": "size", "type": "long"},
9
+ {"name": "content", "type": "bytes"},
10
+ {"name": "creation_timestamp", "type": ["string", "null"]},
11
+ {"name": "modification_timestamp", "type": ["string", "null"]},
12
+ {"name": "access_timestamp", "type": ["string", "null"]}
13
+ ]
14
+ }
@@ -0,0 +1,4 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe RFlow::Components::File::DirectoryWatcher do
4
+ end
@@ -0,0 +1,49 @@
1
+ require 'spec_helper.rb'
2
+
3
+ require 'time'
4
+
5
+ describe RFlow::Components::File::Extensions::FileExtension do
6
+ before(:each) do
7
+ @schema_string = RFlow::Configuration.available_data_types['RFlow::Message::Data::File']['avro']
8
+ end
9
+
10
+ it "should add the extension to RFlow::Configuration" do
11
+ RFlow::Configuration.available_data_extensions['RFlow::Message::Data::File'].should include(described_class)
12
+ end
13
+
14
+ it "should set the defaults" do
15
+ file = RFlow::Message.new('RFlow::Message::Data::File')
16
+
17
+ file.data.path.should == '/'
18
+ file.data.size.should == 0
19
+ file.data.content.should == ''
20
+ file.data.creation_timestamp.should == nil
21
+ file.data.modification_timestamp.should == nil
22
+ file.data.access_timestamp.should == nil
23
+ end
24
+
25
# The size writer coerces its argument with to_i, so both Integer and
# numeric String values must read back as an Integer.
it "should correctly use integers or strings for size field" do
  file = RFlow::Message.new('RFlow::Message::Data::File')

  file.data.size.should == 0
  file.data.size = 10
  file.data.size.should == 10
  file.data.size = '20'
  # This line was a bare comparison without `.should`, so it asserted nothing.
  file.data.size.should == 20
end
34
+
35
+ it "should correctly use Time or xmlschema strings for timestamp fields" do
36
+ file = RFlow::Message.new('RFlow::Message::Data::File')
37
+
38
+ file.data.creation_timestamp.should == nil
39
+ now = Time.now
40
+
41
+ file.data.creation_timestamp = now
42
+ file.data.creation_timestamp.should == Time.xmlschema(now.xmlschema(9))
43
+
44
+ file.data.creation_timestamp = now.xmlschema
45
+ file.data.creation_timestamp.should == Time.xmlschema(now.xmlschema)
46
+ end
47
+
48
+
49
+ end
@@ -0,0 +1,23 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe RFlow::Components::File::OutputRawToFiles do
4
+
5
+ it "should correctly process file name prefix/suffix" do
6
+ component = described_class.new(1)
7
+ component.configure!('file_name_prefix' => 'boom', 'file_name_suffix' => 'town', 'directory_path' => '/tmp')
8
+ component.send(:output_file_name).should match(/boom.*0001town/)
9
+ end
10
+
11
+ it "should do stuff" do
12
+ component = described_class.new(1)
13
+ component.configure!('file_name_prefix' => 'boom.', 'file_name_suffix' => '.town', 'directory_path' => '/tmp')
14
+
15
+ message = RFlow::Message.new('RFlow::Message::Data::Raw')
16
+ message.data.raw = 'boomertown'
17
+
18
+ output_file_path = component.process_message nil, nil, nil, message
19
+
20
+ File.exist?(output_file_path).should be_true
21
+ end
22
+
23
+ end
@@ -0,0 +1,28 @@
1
+ require 'spec_helper.rb'
2
+
3
+ describe 'RFlow::Message::Data::File Avro Schema' do
4
+ before(:each) do
5
+ @schema_string = RFlow::Configuration.available_data_types['RFlow::Message::Data::File']['avro']
6
+ end
7
+
8
+ it "should encode and decode an object" do
9
+ file = {
10
+ 'path' => '/full/file/path/filename',
11
+ 'size' => 1,
12
+ 'content' => 'CONTENT',
13
+ 'creation_timestamp' => 'CREATEDTIMESTRING',
14
+ 'modification_timestamp' => 'MODIFIEDTIMESTRING',
15
+ 'access_timestamp' => 'ACCESSEDTIMESTRING'
16
+ }
17
+
18
+ expect {encode_avro(@schema_string, file)}.to_not raise_error
19
+ avro_encoded_file = encode_avro(@schema_string, file)
20
+
21
+ expect {decode_avro(@schema_string, avro_encoded_file)}.to_not raise_error
22
+ decoded_file = decode_avro(@schema_string, avro_encoded_file)
23
+
24
+ decoded_file.should == file
25
+
26
+ end
27
+ end
28
+
@@ -0,0 +1,18 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib', 'rflow-components-file'))
2
+
3
+ require 'logger'
4
+
5
+ RFlow.logger = Logger.new STDOUT
6
+
7
# Decodes an Avro binary string using the schema given as a JSON string
# (the same schema is used as writer and reader schema).
#
# Returns the decoded datum.
def decode_avro(schema_string, serialized_object)
  parsed_schema = Avro::Schema.parse(schema_string)
  decoder = Avro::IO::BinaryDecoder.new(StringIO.new(serialized_object))
  Avro::IO::DatumReader.new(parsed_schema, parsed_schema).read(decoder)
end
12
+
13
# Encodes +object+ to Avro binary using the schema given as a JSON string.
#
# Returns the serialized bytes as a String.
def encode_avro(schema_string, object)
  parsed_schema = Avro::Schema.parse(schema_string)
  buffer = StringIO.new
  writer = Avro::IO::DatumWriter.new(parsed_schema)
  writer.write(object, Avro::IO::BinaryEncoder.new(buffer))
  buffer.string
end
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rflow-components-file
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Michael L. Artz
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-03-16 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rflow
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '0.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '0.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: eventmachine_httpserver
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: '0.2'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: '0.2'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '2.6'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '2.6'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '0.8'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '0.8'
78
+ description: Components that operate on files for the RFlow FBP framework. Also includes
79
+ the File schema
80
+ email:
81
+ - michael.artz@redjack.com
82
+ executables: []
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - .gitignore
87
+ - Gemfile
88
+ - Rakefile
89
+ - lib/rflow-components-file.rb
90
+ - lib/rflow/components/file.rb
91
+ - lib/rflow/components/file/directory_watcher.rb
92
+ - lib/rflow/components/file/extensions.rb
93
+ - lib/rflow/components/file/output_raw_to_files.rb
94
+ - lib/rflow/components/file/version.rb
95
+ - rflow-components-file.gemspec
96
+ - schema/file.avsc
97
+ - spec/directory_watcher_spec.rb
98
+ - spec/extensions_spec.rb
99
+ - spec/output_raw_to_files_spec.rb
100
+ - spec/schema_spec.rb
101
+ - spec/spec_helper.rb
102
+ homepage: ''
103
+ licenses: []
104
+ post_install_message:
105
+ rdoc_options: []
106
+ require_paths:
107
+ - lib
108
+ required_ruby_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ~>
112
+ - !ruby/object:Gem::Version
113
+ version: '1.9'
114
+ required_rubygems_version: !ruby/object:Gem::Requirement
115
+ none: false
116
+ requirements:
117
+ - - ! '>='
118
+ - !ruby/object:Gem::Version
119
+ version: '0'
120
+ requirements: []
121
+ rubyforge_project: rflow-components-file
122
+ rubygems_version: 1.8.24
123
+ signing_key:
124
+ specification_version: 3
125
+ summary: Components that operate on files for the RFlow FBP framework
126
+ test_files:
127
+ - spec/directory_watcher_spec.rb
128
+ - spec/extensions_spec.rb
129
+ - spec/output_raw_to_files_spec.rb
130
+ - spec/schema_spec.rb
131
+ - spec/spec_helper.rb