embulk-filter-azure_text_analytics 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cac630a471bf5e67125d08e80f149d46eeccfcc3
4
- data.tar.gz: f1194a27bd82b07d1571fc555045e04c2abc96d2
3
+ metadata.gz: 0c87e0a6e5f569a09564b23881c78ef7022e3605
4
+ data.tar.gz: 057800f35d38cefed68ed8dee262f4cd4cd0ff2e
5
5
  SHA512:
6
- metadata.gz: fdc16277d95fad436d178c78cc391d89e948ebf39236440fa9263ee6c992e4e3ee953ddcd5aa3f414bf168e31edbcbea02d077cd3008246b51d21a09f9c157bf
7
- data.tar.gz: 73f376ee3f09ec3ad0f355108f7c5a922d28d8d28b19ae84c9843164532aa55e22aa93159fa97e0e2bcf202ebe8d7d650dda48504d9588846e6d70129c4146f4
6
+ metadata.gz: 7051bbf8881458e3c40270c770c414c05334d75bca6e0bac18dad8fa68ddd4c12121cb7eef9021c4c7ae6bc4f99c4d5ff2826f4d344cb13d7418f6fcfa22307f
7
+ data.tar.gz: 238cd7a23073962212d7370e041b762eb0134c371f3748bb5724b938db533a32d1605babade73856f07b0a175b30cd8d5a54fc0f84de135cfb211b9ebdc5d925
@@ -1 +1 @@
1
- jruby-9.1.5.0
1
+ jruby-9.1.5.0
data/README.md CHANGED
@@ -28,7 +28,6 @@ Azure Text Analytics filter plugin for Embulk.
28
28
  ### sentiment
29
29
 
30
30
  ```yaml
31
- # en,es,fr,pt
32
31
  - type: azure_text_analytics
33
32
  api_type: sentiment
34
33
  key_name: target_key
@@ -71,7 +70,11 @@ Azure Text Analytics filter plugin for Embulk.
71
70
  ### keyPhrases
72
71
 
73
72
  ```yaml
74
- # en,es,fr,pt
73
+ exec:
74
+ max_threads: 1
75
+ min_output_tasks: 1
76
+
77
+ filters:
75
78
  - type: azure_text_analytics_topics
76
79
  out_key_name: _parsed
77
80
  key_name: pr
@@ -82,6 +85,7 @@ Azure Text Analytics filter plugin for Embulk.
82
85
 
83
86
  ```
84
87
  * required, over 100 documents.
88
+ * en,es,fr,pt support
85
89
 
86
90
  ## Build
87
91
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-filter-azure_text_analytics"
4
- spec.version = "0.1.0"
4
+ spec.version = "0.2.0"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Azure Text Analytics filter plugin for Embulk"
7
7
  spec.description = "Azure Text Analytics"
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
13
13
  spec.test_files = spec.files.grep(%r{^(test|spec)/})
14
14
  spec.require_paths = ["lib"]
15
15
 
16
- spec.add_development_dependency 'embulk', ['>= 0.8.15']
16
+ spec.add_development_dependency 'embulk', ['>= 0.8.16']
17
17
  spec.add_development_dependency 'bundler', ['>= 1.10.6']
18
18
  spec.add_development_dependency 'rake', ['>= 10.0']
19
19
  end
@@ -18,14 +18,19 @@ module Embulk
18
18
  "body_params" => config.param("body_params", :hash, default: {}),
19
19
  "params" => config.param("params", :hash, default: {}),
20
20
  "subscription_key" => config.param("subscription_key", :string),
21
+ "operation_id" => config.param("operation_id", :string, default: nil),
21
22
  "stop_words" => config.param("stop_words", :array, default: nil),
22
23
  "stop_phrases" => config.param("stop_phrases", :array, default: nil),
24
+ "id_format" => config.param("id_format", :string, default: nil),
25
+ "id_keys" => config.param("id_keys", :array, default: []),
23
26
  }
24
27
 
25
- out_columns = [
26
- Column.new(nil, task["out_key_name"], :json)
28
+ add_columns = [
29
+ Column.new(nil, task['out_key_name'], :json)
27
30
  ]
28
31
 
32
+ out_columns = in_schema + add_columns
33
+
29
34
  yield(task, out_columns)
30
35
  end
31
36
 
@@ -37,6 +42,11 @@ module Embulk
37
42
  @out_key_name = task['out_key_name']
38
43
  @stop_words = task['stop_words']
39
44
  @stop_phrases = task['stop_phrases']
45
+ @id_format = task['id_format']
46
+ @id_keys = task['id_keys']
47
+ @operation_location = if task['operation_id']
48
+ "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/operations/" + task['operation_id']
49
+ end
40
50
 
41
51
  uri_string = "#{ENDPOINT_PREFIX}/topics"
42
52
  @uri = URI.parse(uri_string)
@@ -72,7 +82,11 @@ module Embulk
72
82
  documents = []
73
83
  records.each do |record|
74
84
  document = {}
75
- document_id = SecureRandom.uuid
85
+ document_id = if (@id_format && @id_keys)
86
+ generate_id(@id_format, record, @id_keys)
87
+ else
88
+ SecureRandom.uuid
89
+ end
76
90
  document["language"] = @language if @language
77
91
  document["id"] = document_id
78
92
  document["text"] = record[@key_name]
@@ -86,43 +100,66 @@ module Embulk
86
100
  @request.body = request_body_hash.to_json
87
101
  Embulk.logger.debug "request body => #{@request.body}"
88
102
  response_hash = @http.start do |h|
89
- proc_http(h)
103
+ @operation_location ? proc_topics(h, @operation_location) : proc_http(h)
104
+ end
105
+ if response_hash.key?('innerError')
106
+ raise "ERROR => #{response_hash}"
90
107
  end
91
-
92
108
  topics = response_hash["operationProcessingResult"]["topics"]
109
+ topics_assignments = response_hash["operationProcessingResult"]["topicAssignments"]
110
+
111
+ topics_assignments_merged = topics_assignments.map do |topics_assignment|
112
+ selected_topics = topics.select { |topic| topics_assignment['topicId'] == topic['id'] }
113
+ result = topics_assignment.merge(selected_topics.first)
114
+ result.delete('id')
115
+ result
116
+ end
93
117
 
94
- topics.each do |data|
95
- page_builder.add([data])
118
+ documents.each_with_index do |document, i|
119
+ record = records[i]
120
+ selected_topics_assignments = topics_assignments_merged.select{ |topics_assignment|
121
+ topics_assignment.delete('topicId')
122
+ topics_assignment['documentId'] == document['id']
123
+ }
124
+ page_builder.add(record.values + [selected_topics_assignments])
96
125
  end
97
126
  end
98
127
 
128
+ def generate_id(template, record, id_keys)
129
+ template % id_keys.map { |key| record[key] }
130
+ end
131
+
99
132
  def proc_http(h)
100
133
  response = h.request(@request)
101
134
  operation_location = response['operation-location']
102
135
 
103
136
  if operation_location
104
- Embulk.logger.info "operation_location => #{operation_location}"
105
- topics_request = Net::HTTP::Get.new(operation_location)
106
- topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
107
- loop do
108
- topics_response = h.request(topics_request)
109
- topics_response_body = topics_response.body
110
- topics_response_hash = JSON.parse(topics_response_body)
111
- status = topics_response_hash['status']
112
- Embulk.logger.info "status => #{status}"
113
- if status == 'Succeeded'
114
- Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
115
- return topics_response_hash
116
- end
117
- if status == 'Failed'
118
- raise "topics_response_hash => #{topics_response_hash}"
119
- end
120
- sleep 60
121
- end
137
+ return proc_topics(h, operation_location)
122
138
  end
123
139
 
124
140
  JSON.parse(response.body)
125
141
  end
142
+
143
+ def proc_topics(h, operation_location)
144
+ Embulk.logger.info "operation_location => #{operation_location}"
145
+ topics_request = Net::HTTP::Get.new(operation_location)
146
+ topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
147
+ loop do
148
+ topics_response = h.request(topics_request)
149
+ topics_response_body = topics_response.body
150
+ topics_response_hash = JSON.parse(topics_response_body)
151
+ status = topics_response_hash['status']
152
+ Embulk.logger.info "status => #{status}"
153
+ if status == 'Succeeded'
154
+ Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
155
+ return topics_response_hash
156
+ end
157
+ if status == 'Failed'
158
+ raise "topics_response_hash => #{topics_response_hash}"
159
+ end
160
+ sleep 60
161
+ end
162
+ end
126
163
  end
127
164
  end
128
165
  end
metadata CHANGED
@@ -1,21 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-azure_text_analytics
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-10 00:00:00.000000000 Z
11
+ date: 2017-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
15
15
  requirements:
16
16
  - - ">="
17
17
  - !ruby/object:Gem::Version
18
- version: 0.8.15
18
+ version: 0.8.16
19
19
  name: embulk
20
20
  prerelease: false
21
21
  type: :development
@@ -23,7 +23,7 @@ dependencies:
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.8.15
26
+ version: 0.8.16
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements: