embulk-filter-azure_text_analytics 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/README.md +6 -2
- data/embulk-filter-azure_text_analytics.gemspec +2 -2
- data/lib/embulk/filter/azure_text_analytics_topics.rb +62 -25
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c87e0a6e5f569a09564b23881c78ef7022e3605
|
4
|
+
data.tar.gz: 057800f35d38cefed68ed8dee262f4cd4cd0ff2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7051bbf8881458e3c40270c770c414c05334d75bca6e0bac18dad8fa68ddd4c12121cb7eef9021c4c7ae6bc4f99c4d5ff2826f4d344cb13d7418f6fcfa22307f
|
7
|
+
data.tar.gz: 238cd7a23073962212d7370e041b762eb0134c371f3748bb5724b938db533a32d1605babade73856f07b0a175b30cd8d5a54fc0f84de135cfb211b9ebdc5d925
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
jruby-9.1.5.0
|
1
|
+
jruby-9.1.5.0
|
data/README.md
CHANGED
@@ -28,7 +28,6 @@ Azure Text Analytics filter plugin for Embulk.
|
|
28
28
|
### sentiment
|
29
29
|
|
30
30
|
```yaml
|
31
|
-
# en,es,fr,pt
|
32
31
|
- type: azure_text_analytics
|
33
32
|
api_type: sentiment
|
34
33
|
key_name: target_key
|
@@ -71,7 +70,11 @@ Azure Text Analytics filter plugin for Embulk.
|
|
71
70
|
### keyPhrases
|
72
71
|
|
73
72
|
```yaml
|
74
|
-
|
73
|
+
exec:
|
74
|
+
max_threads: 1
|
75
|
+
min_output_tasks: 1
|
76
|
+
|
77
|
+
filters:
|
75
78
|
- type: azure_text_analytics_topics
|
76
79
|
out_key_name: _parsed
|
77
80
|
key_name: pr
|
@@ -82,6 +85,7 @@ Azure Text Analytics filter plugin for Embulk.
|
|
82
85
|
|
83
86
|
```
|
84
87
|
* required, over 100 documents.
|
88
|
+
* en,es,fr,pt support
|
85
89
|
|
86
90
|
## Build
|
87
91
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-filter-azure_text_analytics"
|
4
|
-
spec.version = "0.1.0"
|
4
|
+
spec.version = "0.2.0"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Azure Text Analytics filter plugin for Embulk"
|
7
7
|
spec.description = "Azure Text Analytics"
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.test_files = spec.files.grep(%r{^(test|spec)/})
|
14
14
|
spec.require_paths = ["lib"]
|
15
15
|
|
16
|
-
spec.add_development_dependency 'embulk', ['>= 0.8.
|
16
|
+
spec.add_development_dependency 'embulk', ['>= 0.8.16']
|
17
17
|
spec.add_development_dependency 'bundler', ['>= 1.10.6']
|
18
18
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
19
19
|
end
|
@@ -18,14 +18,19 @@ module Embulk
|
|
18
18
|
"body_params" => config.param("body_params", :hash, default: {}),
|
19
19
|
"params" => config.param("params", :hash, default: {}),
|
20
20
|
"subscription_key" => config.param("subscription_key", :string),
|
21
|
+
"operation_id" => config.param("operation_id", :string, default: nil),
|
21
22
|
"stop_words" => config.param("stop_words", :array, default: nil),
|
22
23
|
"stop_phrases" => config.param("stop_phrases", :array, default: nil),
|
24
|
+
"id_format" => config.param("id_format", :string, default: nil),
|
25
|
+
"id_keys" => config.param("id_keys", :array, default: []),
|
23
26
|
}
|
24
27
|
|
25
|
-
|
26
|
-
Column.new(nil, task[
|
28
|
+
add_columns = [
|
29
|
+
Column.new(nil, task['out_key_name'], :json)
|
27
30
|
]
|
28
31
|
|
32
|
+
out_columns = in_schema + add_columns
|
33
|
+
|
29
34
|
yield(task, out_columns)
|
30
35
|
end
|
31
36
|
|
@@ -37,6 +42,11 @@ module Embulk
|
|
37
42
|
@out_key_name = task['out_key_name']
|
38
43
|
@stop_words = task['stop_words']
|
39
44
|
@stop_phrases = task['stop_phrases']
|
45
|
+
@id_format = task['id_format']
|
46
|
+
@id_keys = task['id_keys']
|
47
|
+
@operation_location = if task['operation_id']
|
48
|
+
"https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/operations/" + task['operation_id']
|
49
|
+
end
|
40
50
|
|
41
51
|
uri_string = "#{ENDPOINT_PREFIX}/topics"
|
42
52
|
@uri = URI.parse(uri_string)
|
@@ -72,7 +82,11 @@ module Embulk
|
|
72
82
|
documents = []
|
73
83
|
records.each do |record|
|
74
84
|
document = {}
|
75
|
-
document_id =
|
85
|
+
document_id = if (@id_format && @id_keys)
|
86
|
+
generate_id(@id_format, record, @id_keys)
|
87
|
+
else
|
88
|
+
SecureRandom.uuid
|
89
|
+
end
|
76
90
|
document["language"] = @language if @language
|
77
91
|
document["id"] = document_id
|
78
92
|
document["text"] = record[@key_name]
|
@@ -86,43 +100,66 @@ module Embulk
|
|
86
100
|
@request.body = request_body_hash.to_json
|
87
101
|
Embulk.logger.debug "request body => #{@request.body}"
|
88
102
|
response_hash = @http.start do |h|
|
89
|
-
proc_http(h)
|
103
|
+
@operation_location ? proc_topics(h, @operation_location) : proc_http(h)
|
104
|
+
end
|
105
|
+
if response_hash.key?('innerError')
|
106
|
+
raise "ERROR => #{response_hash}"
|
90
107
|
end
|
91
|
-
|
92
108
|
topics = response_hash["operationProcessingResult"]["topics"]
|
109
|
+
topics_assignments = response_hash["operationProcessingResult"]["topicAssignments"]
|
110
|
+
|
111
|
+
topics_assignments_merged = topics_assignments.map do |topics_assignment|
|
112
|
+
selected_topics = topics.select { |topic| topics_assignment['topicId'] == topic['id'] }
|
113
|
+
result = topics_assignment.merge(selected_topics.first)
|
114
|
+
result.delete('id')
|
115
|
+
result
|
116
|
+
end
|
93
117
|
|
94
|
-
|
95
|
-
|
118
|
+
documents.each_with_index do |document, i|
|
119
|
+
record = records[i]
|
120
|
+
selected_topics_assignments = topics_assignments_merged.select{ |topics_assignment|
|
121
|
+
topics_assignment.delete('topicId')
|
122
|
+
topics_assignment['documentId'] == document['id']
|
123
|
+
}
|
124
|
+
page_builder.add(record.values + [selected_topics_assignments])
|
96
125
|
end
|
97
126
|
end
|
98
127
|
|
128
|
+
def generate_id(template, record, id_keys)
|
129
|
+
template % id_keys.map { |key| record[key] }
|
130
|
+
end
|
131
|
+
|
99
132
|
def proc_http(h)
|
100
133
|
response = h.request(@request)
|
101
134
|
operation_location = response['operation-location']
|
102
135
|
|
103
136
|
if operation_location
|
104
|
-
|
105
|
-
topics_request = Net::HTTP::Get.new(operation_location)
|
106
|
-
topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
|
107
|
-
loop do
|
108
|
-
topics_response = h.request(topics_request)
|
109
|
-
topics_response_body = topics_response.body
|
110
|
-
topics_response_hash = JSON.parse(topics_response_body)
|
111
|
-
status = topics_response_hash['status']
|
112
|
-
Embulk.logger.info "status => #{status}"
|
113
|
-
if status == 'Succeeded'
|
114
|
-
Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
|
115
|
-
return topics_response_hash
|
116
|
-
end
|
117
|
-
if status == 'Failed'
|
118
|
-
raise "topics_response_hash => #{topics_response_hash}"
|
119
|
-
end
|
120
|
-
sleep 60
|
121
|
-
end
|
137
|
+
return proc_topics(h, operation_location)
|
122
138
|
end
|
123
139
|
|
124
140
|
JSON.parse(response.body)
|
125
141
|
end
|
142
|
+
|
143
|
+
def proc_topics(h, operation_location)
|
144
|
+
Embulk.logger.info "operation_location => #{operation_location}"
|
145
|
+
topics_request = Net::HTTP::Get.new(operation_location)
|
146
|
+
topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
|
147
|
+
loop do
|
148
|
+
topics_response = h.request(topics_request)
|
149
|
+
topics_response_body = topics_response.body
|
150
|
+
topics_response_hash = JSON.parse(topics_response_body)
|
151
|
+
status = topics_response_hash['status']
|
152
|
+
Embulk.logger.info "status => #{status}"
|
153
|
+
if status == 'Succeeded'
|
154
|
+
Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
|
155
|
+
return topics_response_hash
|
156
|
+
end
|
157
|
+
if status == 'Failed'
|
158
|
+
raise "topics_response_hash => #{topics_response_hash}"
|
159
|
+
end
|
160
|
+
sleep 60
|
161
|
+
end
|
162
|
+
end
|
126
163
|
end
|
127
164
|
end
|
128
165
|
end
|
metadata
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-azure_text_analytics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
15
15
|
requirements:
|
16
16
|
- - ">="
|
17
17
|
- !ruby/object:Gem::Version
|
18
|
-
version: 0.8.
|
18
|
+
version: 0.8.16
|
19
19
|
name: embulk
|
20
20
|
prerelease: false
|
21
21
|
type: :development
|
@@ -23,7 +23,7 @@ dependencies:
|
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.8.
|
26
|
+
version: 0.8.16
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
29
29
|
requirements:
|