embulk-filter-azure_text_analytics 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cac630a471bf5e67125d08e80f149d46eeccfcc3
4
- data.tar.gz: f1194a27bd82b07d1571fc555045e04c2abc96d2
3
+ metadata.gz: 0c87e0a6e5f569a09564b23881c78ef7022e3605
4
+ data.tar.gz: 057800f35d38cefed68ed8dee262f4cd4cd0ff2e
5
5
  SHA512:
6
- metadata.gz: fdc16277d95fad436d178c78cc391d89e948ebf39236440fa9263ee6c992e4e3ee953ddcd5aa3f414bf168e31edbcbea02d077cd3008246b51d21a09f9c157bf
7
- data.tar.gz: 73f376ee3f09ec3ad0f355108f7c5a922d28d8d28b19ae84c9843164532aa55e22aa93159fa97e0e2bcf202ebe8d7d650dda48504d9588846e6d70129c4146f4
6
+ metadata.gz: 7051bbf8881458e3c40270c770c414c05334d75bca6e0bac18dad8fa68ddd4c12121cb7eef9021c4c7ae6bc4f99c4d5ff2826f4d344cb13d7418f6fcfa22307f
7
+ data.tar.gz: 238cd7a23073962212d7370e041b762eb0134c371f3748bb5724b938db533a32d1605babade73856f07b0a175b30cd8d5a54fc0f84de135cfb211b9ebdc5d925
@@ -1 +1 @@
1
- jruby-9.1.5.0
1
+ jruby-9.1.5.0
data/README.md CHANGED
@@ -28,7 +28,6 @@ Azure Text Analytics filter plugin for Embulk.
28
28
  ### sentiment
29
29
 
30
30
  ```yaml
31
- # en,es,fr,pt
32
31
  - type: azure_text_analytics
33
32
  api_type: sentiment
34
33
  key_name: target_key
@@ -71,7 +70,11 @@ Azure Text Analytics filter plugin for Embulk.
71
70
  ### keyPhrases
72
71
 
73
72
  ```yaml
74
- # en,es,fr,pt
73
+ exec:
74
+ max_threads: 1
75
+ min_output_tasks: 1
76
+
77
+ filters:
75
78
  - type: azure_text_analytics_topics
76
79
  out_key_name: _parsed
77
80
  key_name: pr
@@ -82,6 +85,7 @@ Azure Text Analytics filter plugin for Embulk.
82
85
 
83
86
  ```
84
87
  * required, over 100 documents.
88
+ * en,es,fr,pt support
85
89
 
86
90
  ## Build
87
91
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-filter-azure_text_analytics"
4
- spec.version = "0.1.0"
4
+ spec.version = "0.2.0"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Azure Text Analytics filter plugin for Embulk"
7
7
  spec.description = "Azure Text Analytics"
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
13
13
  spec.test_files = spec.files.grep(%r{^(test|spec)/})
14
14
  spec.require_paths = ["lib"]
15
15
 
16
- spec.add_development_dependency 'embulk', ['>= 0.8.15']
16
+ spec.add_development_dependency 'embulk', ['>= 0.8.16']
17
17
  spec.add_development_dependency 'bundler', ['>= 1.10.6']
18
18
  spec.add_development_dependency 'rake', ['>= 10.0']
19
19
  end
@@ -18,14 +18,19 @@ module Embulk
18
18
  "body_params" => config.param("body_params", :hash, default: {}),
19
19
  "params" => config.param("params", :hash, default: {}),
20
20
  "subscription_key" => config.param("subscription_key", :string),
21
+ "operation_id" => config.param("operation_id", :string, default: nil),
21
22
  "stop_words" => config.param("stop_words", :array, default: nil),
22
23
  "stop_phrases" => config.param("stop_phrases", :array, default: nil),
24
+ "id_format" => config.param("id_format", :string, default: nil),
25
+ "id_keys" => config.param("id_keys", :array, default: []),
23
26
  }
24
27
 
25
- out_columns = [
26
- Column.new(nil, task["out_key_name"], :json)
28
+ add_columns = [
29
+ Column.new(nil, task['out_key_name'], :json)
27
30
  ]
28
31
 
32
+ out_columns = in_schema + add_columns
33
+
29
34
  yield(task, out_columns)
30
35
  end
31
36
 
@@ -37,6 +42,11 @@ module Embulk
37
42
  @out_key_name = task['out_key_name']
38
43
  @stop_words = task['stop_words']
39
44
  @stop_phrases = task['stop_phrases']
45
+ @id_format = task['id_format']
46
+ @id_keys = task['id_keys']
47
+ @operation_location = if task['operation_id']
48
+ "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/operations/" + task['operation_id']
49
+ end
40
50
 
41
51
  uri_string = "#{ENDPOINT_PREFIX}/topics"
42
52
  @uri = URI.parse(uri_string)
@@ -72,7 +82,11 @@ module Embulk
72
82
  documents = []
73
83
  records.each do |record|
74
84
  document = {}
75
- document_id = SecureRandom.uuid
85
+ document_id = if (@id_format && @id_keys)
86
+ generate_id(@id_format, record, @id_keys)
87
+ else
88
+ SecureRandom.uuid
89
+ end
76
90
  document["language"] = @language if @language
77
91
  document["id"] = document_id
78
92
  document["text"] = record[@key_name]
@@ -86,43 +100,66 @@ module Embulk
86
100
  @request.body = request_body_hash.to_json
87
101
  Embulk.logger.debug "request body => #{@request.body}"
88
102
  response_hash = @http.start do |h|
89
- proc_http(h)
103
+ @operation_location ? proc_topics(h, @operation_location) : proc_http(h)
104
+ end
105
+ if response_hash.key?('innerError')
106
+ raise "ERROR => #{response_hash}"
90
107
  end
91
-
92
108
  topics = response_hash["operationProcessingResult"]["topics"]
109
+ topics_assignments = response_hash["operationProcessingResult"]["topicAssignments"]
110
+
111
+ topics_assignments_merged = topics_assignments.map do |topics_assignment|
112
+ selected_topics = topics.select { |topic| topics_assignment['topicId'] == topic['id'] }
113
+ result = topics_assignment.merge(selected_topics.first)
114
+ result.delete('id')
115
+ result
116
+ end
93
117
 
94
- topics.each do |data|
95
- page_builder.add([data])
118
+ documents.each_with_index do |document, i|
119
+ record = records[i]
120
+ selected_topics_assignments = topics_assignments_merged.select{ |topics_assignment|
121
+ topics_assignment.delete('topicId')
122
+ topics_assignment['documentId'] == document['id']
123
+ }
124
+ page_builder.add(record.values + [selected_topics_assignments])
96
125
  end
97
126
  end
98
127
 
128
+ def generate_id(template, record, id_keys)
129
+ template % id_keys.map { |key| record[key] }
130
+ end
131
+
99
132
  def proc_http(h)
100
133
  response = h.request(@request)
101
134
  operation_location = response['operation-location']
102
135
 
103
136
  if operation_location
104
- Embulk.logger.info "operation_location => #{operation_location}"
105
- topics_request = Net::HTTP::Get.new(operation_location)
106
- topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
107
- loop do
108
- topics_response = h.request(topics_request)
109
- topics_response_body = topics_response.body
110
- topics_response_hash = JSON.parse(topics_response_body)
111
- status = topics_response_hash['status']
112
- Embulk.logger.info "status => #{status}"
113
- if status == 'Succeeded'
114
- Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
115
- return topics_response_hash
116
- end
117
- if status == 'Failed'
118
- raise "topics_response_hash => #{topics_response_hash}"
119
- end
120
- sleep 60
121
- end
137
+ return proc_topics(h, operation_location)
122
138
  end
123
139
 
124
140
  JSON.parse(response.body)
125
141
  end
142
+
143
+ def proc_topics(h, operation_location)
144
+ Embulk.logger.info "operation_location => #{operation_location}"
145
+ topics_request = Net::HTTP::Get.new(operation_location)
146
+ topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
147
+ loop do
148
+ topics_response = h.request(topics_request)
149
+ topics_response_body = topics_response.body
150
+ topics_response_hash = JSON.parse(topics_response_body)
151
+ status = topics_response_hash['status']
152
+ Embulk.logger.info "status => #{status}"
153
+ if status == 'Succeeded'
154
+ Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
155
+ return topics_response_hash
156
+ end
157
+ if status == 'Failed'
158
+ raise "topics_response_hash => #{topics_response_hash}"
159
+ end
160
+ sleep 60
161
+ end
162
+ end
126
163
  end
127
164
  end
128
165
  end
metadata CHANGED
@@ -1,21 +1,21 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-filter-azure_text_analytics
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-10 00:00:00.000000000 Z
11
+ date: 2017-02-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
15
15
  requirements:
16
16
  - - ">="
17
17
  - !ruby/object:Gem::Version
18
- version: 0.8.15
18
+ version: 0.8.16
19
19
  name: embulk
20
20
  prerelease: false
21
21
  type: :development
@@ -23,7 +23,7 @@ dependencies:
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 0.8.15
26
+ version: 0.8.16
27
27
  - !ruby/object:Gem::Dependency
28
28
  requirement: !ruby/object:Gem::Requirement
29
29
  requirements: