embulk-filter-azure_text_analytics 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/README.md +6 -2
- data/embulk-filter-azure_text_analytics.gemspec +2 -2
- data/lib/embulk/filter/azure_text_analytics_topics.rb +62 -25
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0c87e0a6e5f569a09564b23881c78ef7022e3605
|
4
|
+
data.tar.gz: 057800f35d38cefed68ed8dee262f4cd4cd0ff2e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7051bbf8881458e3c40270c770c414c05334d75bca6e0bac18dad8fa68ddd4c12121cb7eef9021c4c7ae6bc4f99c4d5ff2826f4d344cb13d7418f6fcfa22307f
|
7
|
+
data.tar.gz: 238cd7a23073962212d7370e041b762eb0134c371f3748bb5724b938db533a32d1605babade73856f07b0a175b30cd8d5a54fc0f84de135cfb211b9ebdc5d925
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
jruby-9.1.5.0
|
1
|
+
jruby-9.1.5.0
|
data/README.md
CHANGED
@@ -28,7 +28,6 @@ Azure Text Analytics filter plugin for Embulk.
|
|
28
28
|
### sentiment
|
29
29
|
|
30
30
|
```yaml
|
31
|
-
# en,es,fr,pt
|
32
31
|
- type: azure_text_analytics
|
33
32
|
api_type: sentiment
|
34
33
|
key_name: target_key
|
@@ -71,7 +70,11 @@ Azure Text Analytics filter plugin for Embulk.
|
|
71
70
|
### keyPhrases
|
72
71
|
|
73
72
|
```yaml
|
74
|
-
|
73
|
+
exec:
|
74
|
+
max_threads: 1
|
75
|
+
min_output_tasks: 1
|
76
|
+
|
77
|
+
filters:
|
75
78
|
- type: azure_text_analytics_topics
|
76
79
|
out_key_name: _parsed
|
77
80
|
key_name: pr
|
@@ -82,6 +85,7 @@ Azure Text Analytics filter plugin for Embulk.
|
|
82
85
|
|
83
86
|
```
|
84
87
|
* required, over 100 documents.
|
88
|
+
* en,es,fr,pt support
|
85
89
|
|
86
90
|
## Build
|
87
91
|
|
data/embulk-filter-azure_text_analytics.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-filter-azure_text_analytics"
|
4
|
-
spec.version = "0.1.0"
|
4
|
+
spec.version = "0.2.0"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Azure Text Analytics filter plugin for Embulk"
|
7
7
|
spec.description = "Azure Text Analytics"
|
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
|
|
13
13
|
spec.test_files = spec.files.grep(%r{^(test|spec)/})
|
14
14
|
spec.require_paths = ["lib"]
|
15
15
|
|
16
|
-
spec.add_development_dependency 'embulk', ['>= 0.8.
|
16
|
+
spec.add_development_dependency 'embulk', ['>= 0.8.16']
|
17
17
|
spec.add_development_dependency 'bundler', ['>= 1.10.6']
|
18
18
|
spec.add_development_dependency 'rake', ['>= 10.0']
|
19
19
|
end
|
data/lib/embulk/filter/azure_text_analytics_topics.rb
CHANGED
@@ -18,14 +18,19 @@ module Embulk
|
|
18
18
|
"body_params" => config.param("body_params", :hash, default: {}),
|
19
19
|
"params" => config.param("params", :hash, default: {}),
|
20
20
|
"subscription_key" => config.param("subscription_key", :string),
|
21
|
+
"operation_id" => config.param("operation_id", :string, default: nil),
|
21
22
|
"stop_words" => config.param("stop_words", :array, default: nil),
|
22
23
|
"stop_phrases" => config.param("stop_phrases", :array, default: nil),
|
24
|
+
"id_format" => config.param("id_format", :string, default: nil),
|
25
|
+
"id_keys" => config.param("id_keys", :array, default: []),
|
23
26
|
}
|
24
27
|
|
25
|
-
|
26
|
-
Column.new(nil, task[
|
28
|
+
add_columns = [
|
29
|
+
Column.new(nil, task['out_key_name'], :json)
|
27
30
|
]
|
28
31
|
|
32
|
+
out_columns = in_schema + add_columns
|
33
|
+
|
29
34
|
yield(task, out_columns)
|
30
35
|
end
|
31
36
|
|
@@ -37,6 +42,11 @@ module Embulk
|
|
37
42
|
@out_key_name = task['out_key_name']
|
38
43
|
@stop_words = task['stop_words']
|
39
44
|
@stop_phrases = task['stop_phrases']
|
45
|
+
@id_format = task['id_format']
|
46
|
+
@id_keys = task['id_keys']
|
47
|
+
@operation_location = if task['operation_id']
|
48
|
+
"https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/operations/" + task['operation_id']
|
49
|
+
end
|
40
50
|
|
41
51
|
uri_string = "#{ENDPOINT_PREFIX}/topics"
|
42
52
|
@uri = URI.parse(uri_string)
|
@@ -72,7 +82,11 @@ module Embulk
|
|
72
82
|
documents = []
|
73
83
|
records.each do |record|
|
74
84
|
document = {}
|
75
|
-
document_id =
|
85
|
+
document_id = if (@id_format && @id_keys)
|
86
|
+
generate_id(@id_format, record, @id_keys)
|
87
|
+
else
|
88
|
+
SecureRandom.uuid
|
89
|
+
end
|
76
90
|
document["language"] = @language if @language
|
77
91
|
document["id"] = document_id
|
78
92
|
document["text"] = record[@key_name]
|
@@ -86,43 +100,66 @@ module Embulk
|
|
86
100
|
@request.body = request_body_hash.to_json
|
87
101
|
Embulk.logger.debug "request body => #{@request.body}"
|
88
102
|
response_hash = @http.start do |h|
|
89
|
-
proc_http(h)
|
103
|
+
@operation_location ? proc_topics(h, @operation_location) : proc_http(h)
|
104
|
+
end
|
105
|
+
if response_hash.key?('innerError')
|
106
|
+
raise "ERROR => #{response_hash}"
|
90
107
|
end
|
91
|
-
|
92
108
|
topics = response_hash["operationProcessingResult"]["topics"]
|
109
|
+
topics_assignments = response_hash["operationProcessingResult"]["topicAssignments"]
|
110
|
+
|
111
|
+
topics_assignments_merged = topics_assignments.map do |topics_assignment|
|
112
|
+
selected_topics = topics.select { |topic| topics_assignment['topicId'] == topic['id'] }
|
113
|
+
result = topics_assignment.merge(selected_topics.first)
|
114
|
+
result.delete('id')
|
115
|
+
result
|
116
|
+
end
|
93
117
|
|
94
|
-
|
95
|
-
|
118
|
+
documents.each_with_index do |document, i|
|
119
|
+
record = records[i]
|
120
|
+
selected_topics_assignments = topics_assignments_merged.select{ |topics_assignment|
|
121
|
+
topics_assignment.delete('topicId')
|
122
|
+
topics_assignment['documentId'] == document['id']
|
123
|
+
}
|
124
|
+
page_builder.add(record.values + [selected_topics_assignments])
|
96
125
|
end
|
97
126
|
end
|
98
127
|
|
128
|
+
def generate_id(template, record, id_keys)
|
129
|
+
template % id_keys.map { |key| record[key] }
|
130
|
+
end
|
131
|
+
|
99
132
|
def proc_http(h)
|
100
133
|
response = h.request(@request)
|
101
134
|
operation_location = response['operation-location']
|
102
135
|
|
103
136
|
if operation_location
|
104
|
-
|
105
|
-
topics_request = Net::HTTP::Get.new(operation_location)
|
106
|
-
topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
|
107
|
-
loop do
|
108
|
-
topics_response = h.request(topics_request)
|
109
|
-
topics_response_body = topics_response.body
|
110
|
-
topics_response_hash = JSON.parse(topics_response_body)
|
111
|
-
status = topics_response_hash['status']
|
112
|
-
Embulk.logger.info "status => #{status}"
|
113
|
-
if status == 'Succeeded'
|
114
|
-
Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
|
115
|
-
return topics_response_hash
|
116
|
-
end
|
117
|
-
if status == 'Failed'
|
118
|
-
raise "topics_response_hash => #{topics_response_hash}"
|
119
|
-
end
|
120
|
-
sleep 60
|
121
|
-
end
|
137
|
+
return proc_topics(h, operation_location)
|
122
138
|
end
|
123
139
|
|
124
140
|
JSON.parse(response.body)
|
125
141
|
end
|
142
|
+
|
143
|
+
def proc_topics(h, operation_location)
|
144
|
+
Embulk.logger.info "operation_location => #{operation_location}"
|
145
|
+
topics_request = Net::HTTP::Get.new(operation_location)
|
146
|
+
topics_request['Ocp-Apim-Subscription-Key'] = @subscription_key
|
147
|
+
loop do
|
148
|
+
topics_response = h.request(topics_request)
|
149
|
+
topics_response_body = topics_response.body
|
150
|
+
topics_response_hash = JSON.parse(topics_response_body)
|
151
|
+
status = topics_response_hash['status']
|
152
|
+
Embulk.logger.info "status => #{status}"
|
153
|
+
if status == 'Succeeded'
|
154
|
+
Embulk.logger.debug "topics_response_hash => #{topics_response_hash}"
|
155
|
+
return topics_response_hash
|
156
|
+
end
|
157
|
+
if status == 'Failed'
|
158
|
+
raise "topics_response_hash => #{topics_response_hash}"
|
159
|
+
end
|
160
|
+
sleep 60
|
161
|
+
end
|
162
|
+
end
|
126
163
|
end
|
127
164
|
end
|
128
165
|
end
|
metadata
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-azure_text_analytics
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
15
15
|
requirements:
|
16
16
|
- - ">="
|
17
17
|
- !ruby/object:Gem::Version
|
18
|
-
version: 0.8.
|
18
|
+
version: 0.8.16
|
19
19
|
name: embulk
|
20
20
|
prerelease: false
|
21
21
|
type: :development
|
@@ -23,7 +23,7 @@ dependencies:
|
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 0.8.
|
26
|
+
version: 0.8.16
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
requirement: !ruby/object:Gem::Requirement
|
29
29
|
requirements:
|