embulk-parser-mahout 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2d007c978b923dcd51d399d4fd3af4f838656e56
4
+ data.tar.gz: cf4c38a4b7e77c77d5df8eee97209dfd7ea14fa2
5
+ SHA512:
6
+ metadata.gz: 175df85396025576190e56c1830e301723618a27bf9bd8b50e6d26f337a25f59023bc752a9d46bc9b570651b6e57c589dc34ab0eaad74d8ccb098464c8a939dc
7
+ data.tar.gz: 2ffc1386778f9e76802b49341b2487a67cdcc9d4ee770116f1d70549504f1e5cc16d94a202c9d9fba4b080b0630c808907875ec3fb3633c15e755dfd77311f65
data/.gitignore ADDED
@@ -0,0 +1,8 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
6
+ /.idea
7
+ /*.iml
8
+
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.1.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec # presumably pulls the dependency list from the gemspec file — confirm against Bundler docs
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # Mahout parser plugin for Embulk
2
+
3
+ TODO: Write a short description here and in the embulk-parser-mahout.gemspec file.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: parser
8
+ * **Guess supported**: no
9
+
10
+ ## Configuration
11
+
12
+ - **option1**: description (integer, required)
13
+ - **option2**: description (string, default: `"myvalue"`)
14
+ - **option3**: description (string, default: `null`)
15
+
16
+ ## Example
17
+
18
+ ```yaml
19
+ in:
20
+ type: any file input plugin type
21
+ parser:
22
+ type: mahout
23
+ option1: example1
24
+ option2: example2
25
+ ```
26
+
27
+ (If guess is supported) you don't have to write the `parser:` section in the configuration file. After writing the `in:` section, you can let Embulk guess the `parser:` section using these commands:
28
+
29
+ ```
30
+ $ embulk gem install embulk-parser-mahout
31
+ $ embulk guess -g mahout config.yml -o guessed.yml
32
+ ```
33
+
34
+ ## Build
35
+
36
+ ```
37
+ $ rake
38
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+ # Make a bare `rake` invocation run :build (presumably defined by bundler/gem_tasks — confirm).
3
+ task default: :build
@@ -0,0 +1,20 @@
1
+ # Gem specification for the embulk-parser-mahout plugin: identity, packaged files and dependencies.
2
+ Gem::Specification.new do |spec|
3
+ spec.name = "embulk-parser-mahout"
4
+ spec.version = "0.0.1"
5
+ spec.authors = ["kihengk"]
6
+ spec.summary = "Mahout parser plugin for Embulk"
7
+ spec.description = "Parses Mahout files read by other file input plugins."
8
+ spec.email = ["kihengk@gmail.com"]
9
+ spec.licenses = ["MIT"]
10
+ spec.homepage = "https://github.com/kihengk/embulk-parser-mahout"
11
+ # Package everything `git ls-files` lists plus any jars found under classpath/.
12
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
13
+ spec.test_files = spec.files.grep(%r{^(test|spec)/})
14
+ spec.require_paths = ["lib"]
15
+ # No runtime dependency is declared (the template line below is still commented out); the rest are development-only.
16
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
17
+ spec.add_development_dependency 'embulk', ['>= 0.8.31']
18
+ spec.add_development_dependency 'bundler', ['>= 1.10.6']
19
+ spec.add_development_dependency 'rake', ['>= 10.0']
20
+ end
@@ -0,0 +1,63 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g "mahout" partial-config.yml
6
+ #
7
+ # Depending on the file format the plugin uses, you can choose
8
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
+ # or line guess (LineGuessPlugin).
10
+
11
+ # require "embulk/parser/mahout.rb"
12
+
13
+ # class Mahout < GuessPlugin
14
+ # Plugin.register_guess("mahout", self)
15
+ #
16
+ # def guess(config, sample_buffer)
17
+ # if sample_buffer[0,2] == GZIP_HEADER
18
+ # guessed = {}
19
+ # guessed["type"] = "mahout"
20
+ # guessed["property1"] = "guessed-value"
21
+ # return {"parser" => guessed}
22
+ # else
23
+ # return {}
24
+ # end
25
+ # end
26
+ # end
27
+
28
+ # class Mahout < TextGuessPlugin
29
+ # Plugin.register_guess("mahout", self)
30
+ #
31
+ # def guess_text(config, sample_text)
32
+ # js = JSON.parse(sample_text) rescue nil
33
+ # if js && js["mykeyword"] == "keyword"
34
+ # guessed = {}
35
+ # guessed["type"] = "mahout"
36
+ # guessed["property1"] = "guessed-value"
37
+ # return {"parser" => guessed}
38
+ # else
39
+ # return {}
40
+ # end
41
+ # end
42
+ # end
43
+
44
+ # class Mahout < LineGuessPlugin
45
+ # Plugin.register_guess("mahout", self)
46
+ #
47
+ # def guess_lines(config, sample_lines)
48
+ # all_line_matched = sample_lines.all? do |line|
49
+ # line =~ /mypattern/
50
+ # end
51
+ # if all_line_matched
52
+ # guessed = {}
53
+ # guessed["type"] = "mahout"
54
+ # guessed["property1"] = "guessed-value"
55
+ # return {"parser" => guessed}
56
+ # else
57
+ # return {}
58
+ # end
59
+ # end
60
+ # end
61
+
62
+ end
63
+ end
@@ -0,0 +1,50 @@
1
+ module Embulk
2
+ module Parser
3
+
4
+ class Mahout < ParserPlugin
5
+ Plugin.register_parser("mahout", self)
6
+
7
+ def self.transaction(config, &control)
8
+ # configuration code:
9
+ task = {
10
+ "command" => config.param("command", :string, default: "recommenditembased"), # integer, required
11
+ "schema" => config.param("schema", :array)
12
+ }
13
+
14
+ columns = task["schema"].each_with_index.map do |col, index|
15
+ Column.new(index, col["name"], col["type"].to_sym)
16
+ end
17
+
18
+ yield(task, columns)
19
+ end
20
+
21
+ def init
22
+ @command = task["command"]
23
+ @col = task["columns"]
24
+ end
25
+
26
+ def run(file_input)
27
+ while file = file_input.next_file
28
+
29
+ text = file.read
30
+ text.each_line do |row|
31
+
32
+ record = []
33
+
34
+ user_id = row.match(/^\d*/)[0]
35
+ record.push(user_id)
36
+ recommend_items = row.scan(/(\d*):(\d*\.\d*)/)
37
+ recommend_items.each do |item|
38
+ record += [item[0], item[1]]
39
+ end
40
+ page_builder.add(record)
41
+
42
+ end
43
+
44
+ end
45
+ page_builder.finish
46
+ end
47
+ end
48
+
49
+ end
50
+ end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-parser-mahout
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - kihengk
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-09-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.8.31
19
+ name: embulk
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.8.31
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.10.6
33
+ name: bundler
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.6
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ name: rake
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: Parses Mahout files read by other file input plugins.
56
+ email:
57
+ - kihengk@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".ruby-version"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - embulk-parser-mahout.gemspec
69
+ - lib/embulk/guess/mahout.rb
70
+ - lib/embulk/parser/mahout.rb
71
+ homepage: https://github.com/kihengk/embulk-parser-mahout
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.6.6
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Mahout parser plugin for Embulk
95
+ test_files: []