embulk-filter-azure_text_analytics 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cac630a471bf5e67125d08e80f149d46eeccfcc3
4
+ data.tar.gz: f1194a27bd82b07d1571fc555045e04c2abc96d2
5
+ SHA512:
6
+ metadata.gz: fdc16277d95fad436d178c78cc391d89e948ebf39236440fa9263ee6c992e4e3ee953ddcd5aa3f414bf168e31edbcbea02d077cd3008246b51d21a09f9c157bf
7
+ data.tar.gz: 73f376ee3f09ec3ad0f355108f7c5a922d28d8d28b19ae84c9843164532aa55e22aa93159fa97e0e2bcf202ebe8d7d650dda48504d9588846e6d70129c4146f4
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.1.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # Azure Text Analytics filter plugin for Embulk
2
+
3
+ Azure Text Analytics filter plugin for Embulk.
4
+
5
+ ## Azure Text Analytics Documentation
6
+
7
+ * [Microsoft Cognitive Services \- Documentation](https://www.microsoft.com/cognitive-services/en-us/text-analytics/documentation)
8
+ * [azure\-docs/cognitive\-services\-text\-analytics\-quick\-start\.md at master · Microsoft/azure\-docs](https://github.com/Microsoft/azure-docs/blob/master/articles/cognitive-services/cognitive-services-text-analytics-quick-start.md)
9
+
10
+ ## Overview
11
+
12
+ * **Plugin type**: filter
13
+
14
+ ## Configuration
15
+
16
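+ - **api_type**: API to call; appended to the endpoint URL, e.g. `sentiment`, `languages`, `keyPhrases`. `topics` is rejected; use the `azure_text_analytics_topics` filter instead (string, required)
17
+ - **language**: language code attached to every document when set (string, default: nil)
18
+ - **out_key_name**: name of the JSON column added to the output to hold the API result (string, required)
19
+ - **key_name**: name of the input column whose value is sent as the document text (string, required)
20
+ - **body_params**: extra fields merged into the JSON request body (hash, default: {})
21
+ - **params**: parameters appended to the request URL as a query string (hash, default: {})
22
+ - **delay**: seconds to sleep after each API request (integer, default: 0)
23
+ - **per_request**: accepted but currently not used by the filter (integer, default: 1)
24
+ - **bulk_size**: number of records sent to the API in one request (integer, default: 100)
25
+ - **subscription_key**: Azure subscription key, sent in the `Ocp-Apim-Subscription-Key` header (string, required)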
26
+
27
+ ## Example
28
+ ### sentiment
29
+
30
+ ```yaml
31
+ # en,es,fr,pt
32
+ - type: azure_text_analytics
33
+ api_type: sentiment
34
+ key_name: target_key
35
+ out_key_name: target_key_sentiment
36
+ language: en
37
+ delay: 2
38
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
39
+ ```
40
+
41
+ * Languages supported by the sentiment API:
42
+ * en
43
+ * es
44
+ * fr
45
+ * pt
46
+
47
+
48
+ ### languages
49
+
50
+ ```yaml
51
+ - type: azure_text_analytics
52
+ api_type: languages
53
+ out_key_name: target_key_languages
54
+ language: en
55
+ key_name: target_key
56
+ delay: 2
57
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
58
+ ```
59
+
60
+ ### keyPhrases
61
+
62
+ ```yaml
63
+ - type: azure_text_analytics
64
+ api_type: keyPhrases
65
+ out_key_name: target_key_keyPhrases
66
+ key_name: target_key
67
+ delay: 2
68
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
69
+ ```
70
+
71
+ ### topics
72
+
73
+ ```yaml
74
+ # en,es,fr,pt
75
+ - type: azure_text_analytics_topics
76
+ out_key_name: _parsed
77
+ key_name: pr
78
+ params:
79
+ minDocumentsPerWord: 3
80
+ maxDocumentsPerWord: 10
81
+ subscription_key: {{ env.AZURE_TEXT_SUBSCRIPTION_KEY }}
82
+
83
+ ```
84
+ * The topics API requires 100 or more documents per request.
85
+
86
+ ## Build
87
+
88
+ ```
89
+ $ rake
90
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,19 @@
1
+
2
# Gem specification for the Azure Text Analytics Embulk filter plugin.
Gem::Specification.new do |gem|
  # --- identity -----------------------------------------------------------
  gem.name     = "embulk-filter-azure_text_analytics"
  gem.version  = "0.1.0"
  gem.licenses = ["MIT"]
  gem.homepage = "https://github.com/toyama0919/embulk-filter-azure_text_analytics"

  # --- authorship and description -----------------------------------------
  gem.authors     = ["toyama0919"]
  gem.email       = ["toyama0919@gmail.com"]
  gem.summary     = "Azure Text Analytics filter plugin for Embulk"
  gem.description = "Azure Text Analytics"

  # --- packaged files -----------------------------------------------------
  # Everything tracked by git, followed by any jars found under classpath/.
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir["classpath/*.jar"]
  gem.files         = tracked_files + bundled_jars
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # --- development-only dependencies --------------------------------------
  [
    ["embulk",  ">= 0.8.15"],
    ["bundler", ">= 1.10.6"],
    ["rake",    ">= 10.0"],
  ].each do |dependency_name, requirement|
    gem.add_development_dependency dependency_name, [requirement]
  end
end
@@ -0,0 +1,114 @@
1
+ require "json"
2
+ require "net/http"
3
+ require "uri"
4
+
5
+ module Embulk
6
+ module Filter
7
+
8
# Embulk filter that sends one column of every record to an Azure Text
# Analytics v2.0 endpoint and appends the API result to the record as an
# extra JSON column.
#
# Records are buffered and sent in batches of `bulk_size` documents; any
# remainder is sent from #finish.
class AzureTextAnalytics < FilterPlugin
  # SecureRandom.uuid is used for document ids. It is required here so the
  # plugin does not rely on some other library having loaded it already.
  require "securerandom"

  Plugin.register_filter("azure_text_analytics", self)
  ENDPOINT_PREFIX = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0"

  # Reads the plugin configuration into the task hash and declares the
  # output schema: every input column followed by one JSON column named
  # after `out_key_name`.
  #
  # Raises ConfigError when `api_type` is "topics", which this filter does
  # not handle.
  def self.transaction(config, in_schema, &control)
    task = {
      "api_type" => config.param("api_type", :string),
      "language" => config.param("language", :string, default: nil),
      "out_key_name" => config.param("out_key_name", :string),
      "key_name" => config.param("key_name", :string),
      "body_params" => config.param("body_params", :hash, default: {}),
      "params" => config.param("params", :hash, default: {}),
      "delay" => config.param("delay", :integer, default: 0),
      "per_request" => config.param("per_request", :integer, default: 1),
      "bulk_size" => config.param("bulk_size", :integer, default: 100),
      "subscription_key" => config.param("subscription_key", :string),
    }

    if task['api_type'] == 'topics'
      raise ConfigError, "Not support type topics API. use azure_text_analytics_topics."
    end

    add_columns = [
      Column.new(nil, task["out_key_name"], :json)
    ]

    out_columns = in_schema + add_columns

    yield(task, out_columns)
  end

  # Builds the HTTPS client and the reusable POST request from the task
  # settings, and initialises the record buffer.
  def init
    uri_string = "#{ENDPOINT_PREFIX}/#{task['api_type']}"
    @uri = URI.parse(uri_string)
    @uri.query = URI.encode_www_form(task['params'])
    @http = Net::HTTP.new(@uri.host, @uri.port)
    @http.use_ssl = true
    @request = Net::HTTP::Post.new(@uri.request_uri)
    @request['Content-Type'] = 'application/json'
    @request['Ocp-Apim-Subscription-Key'] = task['subscription_key']

    @body_params = task['body_params']
    @per_request = task['per_request'] # stored, but not read anywhere in this class
    @delay = task['delay']
    @key_name = task['key_name']
    @language = task['language']
    @out_key_name = task['out_key_name']
    @bulk_size = task['bulk_size']
    @records = []
  end

  def close
  end

  # Buffers each record of the page as a column-name => value hash and
  # sends the buffer to the API whenever it reaches `bulk_size` records.
  def add(page)
    page.each do |record|
      @records << Hash[in_schema.names.zip(record)]
      next unless @records.size == @bulk_size

      proc_records(@records)
      @records = []
    end
  end

  # Sends whatever is still buffered, then finishes the page builder.
  def finish
    unless @records.empty?
      proc_records(@records)
      @records = []
    end
    page_builder.finish
  end

  private

  # Sends one batch of records to the API and emits each record with the
  # result stored under `out_key_name`.
  #
  # Every record is sent with a freshly generated uuid as its document id
  # and the result is looked up by that id, so a document the service
  # reports as an error, or returns in a different position, cannot shift
  # results onto the wrong record. A record with no entry in the response
  # gets nil.
  #
  # When the response is not a regular result (see #results_by_id) it is
  # logged as an error and the whole response is stored on every record.
  # Sleeps `delay` seconds after the request.
  def proc_records(records)
    ids = records.map { SecureRandom.uuid }
    documents = records.zip(ids).map do |record, id|
      document = {}
      document["language"] = @language if @language
      document["id"] = id
      document["text"] = record[@key_name]
      document
    end

    @request.body = @body_params.merge({ documents: documents }).to_json
    Embulk.logger.debug "request body => #{@request.body}"
    response_hash = @http.start do |h|
      response = h.request(@request)
      JSON.parse(response.body)
    end

    results = results_by_id(response_hash)
    Embulk.logger.error "response body => #{response_hash}" if results.nil?

    records.zip(ids).each do |record, id|
      record[@out_key_name] = results ? results[id] : response_hash
      page_builder.add(record.values)
    end
    sleep @delay
  end

  # Indexes the entries of the response's "documents" array, and of its
  # "errors" array when present, by their "id" field.
  #
  # Returns nil when the response is not a regular result: it is not a
  # hash, it carries an "innerError" key, or it has no "documents" array.
  # NOTE(review): the "errors" array of per-document failures follows the
  # Text Analytics v2.0 response format as understood here - confirm
  # against the API reference.
  def results_by_id(response_hash)
    return nil unless response_hash.is_a?(Hash)
    return nil if response_hash.key?('innerError')

    documents = response_hash['documents']
    return nil unless documents.is_a?(Array)

    errors = response_hash['errors']
    errors = [] unless errors.is_a?(Array)

    # Documents are added last so a successful result wins over an error
    # entry that happens to share the same id.
    (errors + documents).each_with_object({}) do |entry, by_id|
      by_id[entry['id']] = entry if entry.is_a?(Hash)
    end
  end
end
113
+ end
114
+ end
@@ -0,0 +1,128 @@
1
+ require "json"
2
+ require "net/http"
3
+ require "uri"
4
+ require "pp"
5
+
6
+ module Embulk
7
+ module Filter
8
+
9
# Embulk filter that submits every input record as one document set to the
# Azure Text Analytics v2.0 "topics" endpoint and emits one output record
# per detected topic.
#
# All records are buffered until #finish, because the documents are sent
# in a single request.
class AzureTextAnalyticsTopics < FilterPlugin
  # SecureRandom.uuid is used for document ids. It is required here so the
  # plugin does not rely on some other library having loaded it already.
  require "securerandom"

  Plugin.register_filter("azure_text_analytics_topics", self)
  ENDPOINT_PREFIX = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0"

  # Reads the plugin configuration into the task hash and declares the
  # output schema. The output consists solely of one JSON column named
  # after `out_key_name`; input columns are not passed through.
  def self.transaction(config, in_schema, &control)
    task = {
      "language" => config.param("language", :string, default: nil),
      "out_key_name" => config.param("out_key_name", :string),
      "key_name" => config.param("key_name", :string),
      "body_params" => config.param("body_params", :hash, default: {}),
      "params" => config.param("params", :hash, default: {}),
      "subscription_key" => config.param("subscription_key", :string),
      "stop_words" => config.param("stop_words", :array, default: nil),
      "stop_phrases" => config.param("stop_phrases", :array, default: nil),
    }

    out_columns = [
      Column.new(nil, task["out_key_name"], :json)
    ]

    yield(task, out_columns)
  end

  # Copies the task settings into instance variables and builds the HTTPS
  # client and the POST request for the topics endpoint.
  def init
    @subscription_key = task['subscription_key']
    @body_params = task['body_params']
    @key_name = task['key_name']
    @language = task['language']
    @out_key_name = task['out_key_name']
    @stop_words = task['stop_words']
    @stop_phrases = task['stop_phrases']

    uri_string = "#{ENDPOINT_PREFIX}/topics"
    @uri = URI.parse(uri_string)
    @uri.query = URI.encode_www_form(task['params'])
    @http = Net::HTTP.new(@uri.host, @uri.port)
    @http.use_ssl = true

    @request = Net::HTTP::Post.new(@uri.request_uri)
    @request['Content-Type'] = 'application/json'
    @request['Ocp-Apim-Subscription-Key'] = @subscription_key

    @records = []
  end

  def close
  end

  # Buffers each record of the page as a column-name => value hash.
  # Nothing is sent to the API until #finish.
  def add(page)
    page.each do |record|
      @records << Hash[in_schema.names.zip(record)]
    end
  end

  # Submits all buffered records (if any), then finishes the page builder.
  def finish
    proc_records(@records) unless @records.empty?
    page_builder.finish
  end

  private

  # Sends all records as one topics request and adds one output record per
  # returned topic.
  #
  # Raises RuntimeError, with the response in the message, when the final
  # response does not contain an "operationProcessingResult" with a
  # "topics" array - for example when the service rejected the request.
  # Previously that case surfaced as a NoMethodError on nil and hid the
  # service's own error message.
  def proc_records(records)
    documents = records.map do |record|
      document = {}
      document["language"] = @language if @language
      document["id"] = SecureRandom.uuid
      document["text"] = record[@key_name]
      document
    end

    request_body_hash = @body_params.merge({ documents: documents })
    request_body_hash[:stopWords] = @stop_words if @stop_words
    request_body_hash[:stopPhrases] = @stop_phrases if @stop_phrases

    @request.body = request_body_hash.to_json
    Embulk.logger.debug "request body => #{@request.body}"
    response_hash = @http.start do |h|
      proc_http(h)
    end

    result = response_hash.is_a?(Hash) ? response_hash["operationProcessingResult"] : nil
    topics = result.is_a?(Hash) ? result["topics"] : nil
    unless topics.is_a?(Array)
      raise "Unexpected topics API response => #{response_hash}"
    end

    topics.each do |data|
      page_builder.add([data])
    end
  end

  # Posts the prepared request on the open connection `h`.
  #
  # When the response carries an "operation-location" header, that location
  # is polled with GET every 60 seconds until its "status" field is
  # "Succeeded" (the parsed body is returned) or "Failed" (RuntimeError is
  # raised). The loop has no upper bound: any other status keeps polling.
  #
  # Without that header the parsed body of the POST response is returned
  # as is.
  def proc_http(h)
    response = h.request(@request)
    operation_location = response['operation-location']

    if operation_location
      Embulk.logger.info "operation_location => #{operation_location}"
      topics_request = Net::HTTP::Get.new(operation_location)
      topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
      loop do
        topics_response = h.request(topics_request)
        topics_response_hash = JSON.parse(topics_response.body)
        status = topics_response_hash['status']
        Embulk.logger.info "status => #{status}"
        if status == 'Succeeded'
          Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
          return topics_response_hash
        end
        if status == 'Failed'
          raise "topics_response_hash => #{topics_response_hash}"
        end
        sleep 60
      end
    end

    JSON.parse(response.body)
  end
end
127
+ end
128
+ end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-azure_text_analytics
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-02-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.8.15
19
+ name: embulk
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.8.15
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.10.6
33
+ name: bundler
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.6
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ name: rake
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: Azure Text Analytics
56
+ email:
57
+ - toyama0919@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".ruby-version"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - embulk-filter-azure_text_analytics.gemspec
69
+ - lib/embulk/filter/azure_text_analytics.rb
70
+ - lib/embulk/filter/azure_text_analytics_topics.rb
71
+ homepage: https://github.com/toyama0919/embulk-filter-azure_text_analytics
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.6.6
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Azure Text Analytics filter plugin for Embulk
95
+ test_files: []