embulk-filter-azure_text_analytics 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: cac630a471bf5e67125d08e80f149d46eeccfcc3
4
+ data.tar.gz: f1194a27bd82b07d1571fc555045e04c2abc96d2
5
+ SHA512:
6
+ metadata.gz: fdc16277d95fad436d178c78cc391d89e948ebf39236440fa9263ee6c992e4e3ee953ddcd5aa3f414bf168e31edbcbea02d077cd3008246b51d21a09f9c157bf
7
+ data.tar.gz: 73f376ee3f09ec3ad0f355108f7c5a922d28d8d28b19ae84c9843164532aa55e22aa93159fa97e0e2bcf202ebe8d7d650dda48504d9588846e6d70129c4146f4
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.1.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # Azure Text Analytics filter plugin for Embulk
2
+
3
+ Azure Text Analytics filter plugin for Embulk.
4
+
5
+ ## Azure Text Analytics Documentation
6
+
7
+ * [Microsoft Cognitive Services \- Documentation](https://www.microsoft.com/cognitive-services/en-us/text-analytics/documentation)
8
+ * [azure\-docs/cognitive\-services\-text\-analytics\-quick\-start\.md at master · Microsoft/azure\-docs](https://github.com/Microsoft/azure-docs/blob/master/articles/cognitive-services/cognitive-services-text-analytics-quick-start.md)
9
+
10
+ ## Overview
11
+
12
+ * **Plugin type**: filter
13
+
14
+ ## Configuration
15
+
16
+ - **api_type**: api_type(string),
17
+ - **language**: language(string, default: nil),
18
+ - **out_key_name**: out_key_name(string),
19
+ - **key_name**: key_name(string),
20
+ - **body_params**: body_params(hash, default: {}),
21
+ - **params**: params(hash, default: {}),
22
+ - **delay**: delay(integer, default: 0),
23
+ - **per_request**: per_request(integer, default: 1),
24
+ - **bulk_size**: bulk_size(integer, default: 100),
25
+ - **subscription_key**: subscription_key(string),
26
+
27
+ ## Example
28
+ ### sentiment
29
+
30
+ ```yaml
31
+ # en,es,fr,pt
32
+ - type: azure_text_analytics
33
+ api_type: sentiment
34
+ key_name: target_key
35
+ out_key_name: target_key_sentiment
36
+ language: en
37
+ delay: 2
38
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
39
+ ```
40
+
41
+ * Languages supported by the sentiment API:
42
+ * en
43
+ * es
44
+ * fr
45
+ * pt
46
+
47
+
48
+ ### languages
49
+
50
+ ```yaml
51
+ - type: azure_text_analytics
52
+ api_type: languages
53
+ out_key_name: target_key_languages
54
+ language: en
55
+ key_name: target_key
56
+ delay: 2
57
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
58
+ ```
59
+
60
+ ### keyPhrases
61
+
62
+ ```yaml
63
+ - type: azure_text_analytics
64
+ api_type: keyPhrases
65
+ out_key_name: target_key_keyPhrases
66
+ key_name: target_key
67
+ delay: 2
68
+ subscription_key: XXXXXXXXXXXXXXXXXXXXXXXXXXX
69
+ ```
70
+
71
+ ### topics
72
+
73
+ ```yaml
74
+ # en,es,fr,pt
75
+ - type: azure_text_analytics_topics
76
+ out_key_name: _parsed
77
+ key_name: pr
78
+ params:
79
+ minDocumentsPerWord: 3
80
+ maxDocumentsPerWord: 10
81
+ subscription_key: {{ env.AZURE_TEXT_SUBSCRIPTION_KEY }}
82
+
83
+ ```
84
+ * The topics API requires at least 100 documents per request.
85
+
86
+ ## Build
87
+
88
+ ```
89
+ $ rake
90
+ ```
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ task default: :build
@@ -0,0 +1,19 @@
1
+
2
# Gem specification for the Azure Text Analytics filter plugin for Embulk.
Gem::Specification.new do |spec|
  # Identity and descriptive metadata.
  spec.name        = "embulk-filter-azure_text_analytics"
  spec.version     = "0.1.0"
  spec.summary     = "Azure Text Analytics filter plugin for Embulk"
  spec.description = "Azure Text Analytics"
  spec.homepage    = "https://github.com/toyama0919/embulk-filter-azure_text_analytics"
  spec.licenses    = ["MIT"]
  spec.authors     = ["toyama0919"]
  spec.email       = ["toyama0919@gmail.com"]

  # Packaged files: everything tracked by git, followed by any jars found
  # under classpath/.
  tracked_files = `git ls-files`.split("\n")
  bundled_jars  = Dir.glob("classpath/*.jar")
  spec.files         = tracked_files + bundled_jars
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ["lib"]

  # Development-only dependencies, declared in the original order.
  {
    'embulk'  => '>= 0.8.15',
    'bundler' => '>= 1.10.6',
    'rake'    => '>= 10.0'
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, [requirement]
  end
end
@@ -0,0 +1,114 @@
1
require "json"
require "net/http"
require "securerandom" # SecureRandom.uuid is used to build document ids
require "uri"

module Embulk
  module Filter

    # Embulk filter that posts the text found in one column of every record
    # to an Azure Text Analytics v2.0 endpoint (selected by `api_type`) and
    # appends the per-document result to the record as a JSON column.
    #
    # Records are buffered and sent in batches of `bulk_size` documents.
    class AzureTextAnalytics < FilterPlugin
      Plugin.register_filter("azure_text_analytics", self)
      ENDPOINT_PREFIX = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0"

      # Reads the plugin configuration into a task hash and declares the
      # output schema: the input columns followed by one JSON column named
      # by `out_key_name`.
      #
      # Raises ConfigError when `api_type` is 'topics'; that API is handled
      # by the separate azure_text_analytics_topics filter.
      def self.transaction(config, in_schema, &control)
        task = {
          "api_type" => config.param("api_type", :string),
          "language" => config.param("language", :string, default: nil),
          "out_key_name" => config.param("out_key_name", :string),
          "key_name" => config.param("key_name", :string),
          "body_params" => config.param("body_params", :hash, default: {}),
          "params" => config.param("params", :hash, default: {}),
          "delay" => config.param("delay", :integer, default: 0),
          "per_request" => config.param("per_request", :integer, default: 1),
          "bulk_size" => config.param("bulk_size", :integer, default: 100),
          "subscription_key" => config.param("subscription_key", :string),
        }

        if task['api_type'] == 'topics'
          raise ConfigError.new "Not support type topics API. use azure_text_analytics_topics."
        end

        add_columns = [
          Column.new(nil, task["out_key_name"], :json)
        ]

        out_columns = in_schema + add_columns

        yield(task, out_columns)
      end

      # Builds the HTTP client and the reusable POST request from the task,
      # and copies the task settings into instance variables.
      def init
        uri_string = "#{ENDPOINT_PREFIX}/#{task['api_type']}"
        @uri = URI.parse(uri_string)
        @uri.query = URI.encode_www_form(task['params'])
        @http = Net::HTTP.new(@uri.host, @uri.port)
        @http.use_ssl = true
        @request = Net::HTTP::Post.new(@uri.request_uri)
        @request['Content-Type'] = 'application/json'
        @request['Ocp-Apim-Subscription-Key'] = task['subscription_key']

        @body_params = task['body_params']
        @per_request = task['per_request'] # stored only; not read anywhere in this class
        @delay = task['delay']
        @key_name = task['key_name']
        @language = task['language']
        @out_key_name = task['out_key_name']
        @bulk_size = task['bulk_size']
        @records = []
      end

      # Nothing to release.
      def close
      end

      # Buffers the records of the page and flushes the buffer to the API
      # every time it reaches `bulk_size` records.
      def add(page)
        page.each do |record|
          @records << Hash[in_schema.names.zip(record)]
          if @records.size == @bulk_size
            proc_records(@records)
            @records = []
          end
        end
      end

      # Flushes the records still buffered, then finishes the page builder.
      def finish
        if @records.size > 0
          proc_records(@records)
        end
        page_builder.finish
      end

      private

      # Sends one batch to the API and emits every record with the result
      # stored under `out_key_name`.
      #
      # Each result is looked up by the id generated for its document, so a
      # result can never be attached to the wrong record even when the
      # response omits or reorders entries. When the response is a
      # request-level error, the whole response is logged and stored in
      # every record of the batch instead.
      def proc_records(records)
        documents = records.map { |record| build_document(record) }

        @request.body = @body_params.merge({ documents: documents }).to_json
        Embulk.logger.debug "request body => #{@request.body}"
        response_hash = @http.start do |h|
          response = h.request(@request)
          JSON.parse(response.body)
        end

        results = results_by_id(response_hash)
        Embulk.logger.error "response body => #{response_hash}" unless results

        records.zip(documents) do |record, document|
          record[@out_key_name] = results ? results[document["id"]] : response_hash
          page_builder.add(record.values)
        end
        sleep @delay
      end

      # Builds the request document for one record, with a fresh UUID as id.
      def build_document(record)
        document = {}
        document["language"] = @language if @language
        document["id"] = SecureRandom.uuid
        document["text"] = record[@key_name]
        document
      end

      # Indexes the entries of the response by their 'id'.
      #
      # Entries of the optional 'errors' array are indexed as well, so a
      # record whose document was rejected receives its error entry.
      # Returns nil when the response is a request-level error: it is not a
      # hash, it has an 'innerError' key, or it has no 'documents' array.
      def results_by_id(response_hash)
        return nil unless response_hash.is_a?(Hash)
        return nil if response_hash.key?('innerError')

        documents = response_hash['documents']
        return nil unless documents.is_a?(Array)

        entries = Array(response_hash['errors']) + documents
        entries.each_with_object({}) do |entry, by_id|
          by_id[entry['id']] = entry if entry.is_a?(Hash)
        end
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
require "json"
require "net/http"
require "securerandom" # SecureRandom.uuid is used to build document ids
require "uri"
require "pp"

module Embulk
  module Filter

    # Embulk filter that submits the text found in one column of ALL input
    # records to the Azure Text Analytics v2.0 topics endpoint as a single
    # request, waits for the operation to complete, and emits one output
    # row per detected topic.
    class AzureTextAnalyticsTopics < FilterPlugin
      Plugin.register_filter("azure_text_analytics_topics", self)
      ENDPOINT_PREFIX = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0"

      # Reads the plugin configuration into a task hash and declares the
      # output schema. The output has a single JSON column named by
      # `out_key_name`; the input columns are not carried over.
      def self.transaction(config, in_schema, &control)
        task = {
          "language" => config.param("language", :string, default: nil),
          "out_key_name" => config.param("out_key_name", :string),
          "key_name" => config.param("key_name", :string),
          "body_params" => config.param("body_params", :hash, default: {}),
          "params" => config.param("params", :hash, default: {}),
          "subscription_key" => config.param("subscription_key", :string),
          "stop_words" => config.param("stop_words", :array, default: nil),
          "stop_phrases" => config.param("stop_phrases", :array, default: nil),
        }

        out_columns = [
          Column.new(nil, task["out_key_name"], :json)
        ]

        yield(task, out_columns)
      end

      # Copies the task settings into instance variables and builds the
      # HTTP client and the reusable POST request for the topics endpoint.
      def init
        @subscription_key = task['subscription_key']
        @body_params = task['body_params']
        @key_name = task['key_name']
        @language = task['language']
        @out_key_name = task['out_key_name']
        @stop_words = task['stop_words']
        @stop_phrases = task['stop_phrases']

        uri_string = "#{ENDPOINT_PREFIX}/topics"
        @uri = URI.parse(uri_string)
        @uri.query = URI.encode_www_form(task['params'])
        @http = Net::HTTP.new(@uri.host, @uri.port)
        @http.use_ssl = true

        @request = Net::HTTP::Post.new(@uri.request_uri)
        @request['Content-Type'] = 'application/json'
        @request['Ocp-Apim-Subscription-Key'] = @subscription_key

        @records = []
      end

      # Nothing to release.
      def close
      end

      # Buffers every record of the page; nothing is sent until #finish.
      def add(page)
        page.each do |record|
          @records << Hash[in_schema.names.zip(record)]
        end
      end

      # Sends all buffered records in one request, then finishes the page
      # builder.
      def finish
        if @records.size > 0
          proc_records(@records)
        end
        page_builder.finish
      end

      private

      # Submits the records as documents and adds one output row per topic
      # found under operationProcessingResult.topics in the final response.
      #
      # Raises RuntimeError, including the response, when the response does
      # not contain a topics list.
      def proc_records(records)
        documents = records.map do |record|
          document = {}
          document["language"] = @language if @language
          document["id"] = SecureRandom.uuid
          document["text"] = record[@key_name]
          document
        end

        request_body_hash = @body_params.merge({ documents: documents })
        request_body_hash[:stopWords] = @stop_words if @stop_words
        request_body_hash[:stopPhrases] = @stop_phrases if @stop_phrases

        @request.body = request_body_hash.to_json
        Embulk.logger.debug "request body => #{@request.body}"
        response_hash = @http.start do |h|
          proc_http(h)
        end

        # Fail with the service response in the message rather than with a
        # NoMethodError on nil when the request was rejected.
        result = response_hash.is_a?(Hash) ? response_hash["operationProcessingResult"] : nil
        topics = result.is_a?(Hash) ? result["topics"] : nil
        unless topics
          raise "topics not found in response. response_hash => #{response_hash}"
        end

        topics.each do |data|
          page_builder.add([data])
        end
      end

      # Posts the prepared request on the open connection `h`.
      #
      # When the response carries an 'operation-location' header, that
      # location is polled every 60 seconds until the reported status is
      # 'Succeeded' (the parsed response is returned) or 'Failed' (a
      # RuntimeError is raised). Any other status keeps polling; there is
      # no upper bound on the number of attempts.
      #
      # Without the header, the parsed body of the POST response is
      # returned; a body that is not JSON raises RuntimeError with the HTTP
      # status code and the raw body.
      def proc_http(h)
        response = h.request(@request)
        operation_location = response['operation-location']

        if operation_location
          Embulk.logger.info "operation_location => #{operation_location}"
          # The header value is passed through unchanged as the request target.
          topics_request = Net::HTTP::Get.new(operation_location)
          topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
          loop do
            topics_response = h.request(topics_request)
            topics_response_body = topics_response.body
            topics_response_hash = JSON.parse(topics_response_body)
            status = topics_response_hash['status']
            Embulk.logger.info "status => #{status}"
            if status == 'Succeeded'
              Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
              return topics_response_hash
            end
            if status == 'Failed'
              raise "topics_response_hash => #{topics_response_hash}"
            end
            sleep 60
          end
        end

        begin
          JSON.parse(response.body.to_s)
        rescue JSON::ParserError
          raise "topics request was not accepted. code => #{response.code}, body => #{response.body}"
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-azure_text_analytics
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-02-10 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 0.8.15
19
+ name: embulk
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.8.15
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.10.6
33
+ name: bundler
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 1.10.6
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ name: rake
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: Azure Text Analytics
56
+ email:
57
+ - toyama0919@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - ".gitignore"
63
+ - ".ruby-version"
64
+ - Gemfile
65
+ - LICENSE.txt
66
+ - README.md
67
+ - Rakefile
68
+ - embulk-filter-azure_text_analytics.gemspec
69
+ - lib/embulk/filter/azure_text_analytics.rb
70
+ - lib/embulk/filter/azure_text_analytics_topics.rb
71
+ homepage: https://github.com/toyama0919/embulk-filter-azure_text_analytics
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.6.6
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: Azure Text Analytics filter plugin for Embulk
95
+ test_files: []