opener-property-tagger 3.0.5 → 3.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +7 -14
- data/lib/opener/property_tagger/aspects_cache.rb +49 -0
- data/lib/opener/property_tagger/cli.rb +4 -1
- data/lib/opener/property_tagger/processor.rb +38 -33
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -0
- data/task/test.rake +2 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3d36e45bbff187579cbb8f2275d53261ed13030
|
4
|
+
data.tar.gz: 2850b27e9876336083dde1af17083658be60ef1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f95a22d030ff6dc1685ee117ddae2f0d104fce5b187e7bb2062b96dd98ba035dd55c6b9f21266a63e968ff42d9e0869ce762786c7a2d7f5b53b6cc63d5aff4a2
|
7
|
+
data.tar.gz: be9bfed3bb90e26a111e03973a3ec543780e886be08d7102dad057e81917e1323c599198f81d79da58257d85e5adafaf507454a0fa6e30105fc8210698c94e17
|
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'open3'
|
2
2
|
require 'slop'
|
3
3
|
require 'oga'
|
4
|
+
require 'monitor'
|
4
5
|
|
5
6
|
require 'rexml/document'
|
6
7
|
require 'rexml/formatters/pretty'
|
7
8
|
|
8
9
|
require_relative 'property_tagger/version'
|
9
10
|
require_relative 'property_tagger/cli'
|
11
|
+
require_relative 'property_tagger/aspects_cache'
|
10
12
|
require_relative 'property_tagger/processor'
|
11
13
|
|
12
14
|
module Opener
|
@@ -52,24 +54,15 @@ module Opener
|
|
52
54
|
end
|
53
55
|
|
54
56
|
##
|
55
|
-
# Processes the input
|
56
|
-
# STDERR and an object containing process information.
|
57
|
+
# Processes the input KAF document.
|
57
58
|
#
|
58
|
-
# @param [String] input
|
59
|
-
# @return [
|
59
|
+
# @param [String] input
|
60
|
+
# @return [String]
|
60
61
|
#
|
61
62
|
def run(input)
|
62
|
-
|
63
|
-
|
64
|
-
return output
|
65
|
-
end
|
66
|
-
|
67
|
-
protected
|
68
|
-
|
69
|
-
def process(input)
|
70
|
-
processor = Processor.new(input, path, !options[:no_time])
|
63
|
+
timestamp = !options[:no_time]
|
71
64
|
|
72
|
-
return
|
65
|
+
return Processor.new(input, path, timestamp, options[:pretty]).process
|
73
66
|
end
|
74
67
|
end # PropertyTagger
|
75
68
|
end # Opener
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Opener
  class PropertyTagger
    ##
    # Thread-safe cache for storing the contents of aspect files.
    #
    # Every aspects file is parsed at most once per cache instance; later
    # lookups for the same path return the very same Hash object. Both the
    # check and the read of the internal store happen while holding the
    # monitor, so an instance can be shared between threads.
    #
    class AspectsCache
      include MonitorMixin

      def initialize
        # MonitorMixin sets up its lock in #initialize, so `super` is required.
        super

        @cache = {}
      end

      ##
      # Returns the aspects for the given file path. If the aspects don't exist
      # they are first loaded into the cache.
      #
      # @param [String] path Path to a tab separated aspects file.
      # @return [Hash{Symbol => Array<String>}] Lemma => aspects mapping, see
      #  {#load_aspects}.
      #
      def [](path)
        # The lookup is performed inside the lock as well: reading the Hash
        # while another thread is inserting into it is not safe on every Ruby
        # implementation.
        synchronize do
          # load_aspects always returns a Hash (truthy), so ||= is equivalent
          # to an explicit key? check here.
          return @cache[path] ||= load_aspects(path)
        end
      end

      alias_method :get, :[]

      ##
      # Loads the aspects of the given path.
      #
      # Each line is expected to look like `lemma<TAB>pos<TAB>aspect`. The POS
      # column is ignored. Lines without a lemma (e.g. blank lines) are
      # skipped instead of raising a NoMethodError.
      #
      # Looking up a lemma that is not in the file returns an empty Array
      # *without* storing it, so reads never grow or mutate the (shared,
      # cached) mapping.
      #
      # @param [String] path
      # @return [Hash{Symbol => Array<String>}]
      #
      def load_aspects(path)
        mapping = Hash.new { |hash, key| hash[key] = [] }

        File.foreach(path) do |line|
          lemma, _pos, aspect = line.chomp.split("\t")

          next if lemma.nil? || lemma.empty?

          mapping[lemma.to_sym] << aspect
        end

        # The auto-vivifying default is only needed while building the
        # mapping. Swap it for a non-inserting default so that lookups of
        # unknown lemmas don't add keys to the cached Hash.
        mapping.default_proc = proc { |_hash, _key| [] }

        return mapping
      end
    end # AspectsCache
  end # PropertyTagger
end # Opener
|
@@ -56,10 +56,13 @@ Examples:
|
|
56
56
|
|
57
57
|
on :'no-time', 'Disables adding of timestamps'
|
58
58
|
|
59
|
+
on :ugly, 'Disables pretty formatting of XML (faster)'
|
60
|
+
|
59
61
|
run do |opts, args|
|
60
62
|
tagger = PropertyTagger.new(
|
61
63
|
:args => args,
|
62
|
-
:no_time => opts[:'no-time']
|
64
|
+
:no_time => opts[:'no-time'],
|
65
|
+
:pretty => !opts[:ugly]
|
63
66
|
)
|
64
67
|
|
65
68
|
input = STDIN.tty? ? nil : STDIN.read
|
@@ -4,13 +4,27 @@ module Opener
|
|
4
4
|
# Class that applies property tagging to a given input KAF file.
|
5
5
|
#
|
6
6
|
class Processor
|
7
|
-
attr_accessor :document, :aspects_path, :
|
8
|
-
:timestamp
|
7
|
+
attr_accessor :document, :aspects_path, :timestamp, :pretty
|
9
8
|
|
10
|
-
|
9
|
+
##
|
10
|
+
# Global cache used for storing loaded aspects.
|
11
|
+
#
|
12
|
+
# @return [Opener::PropertyTagger::AspectsCache]
|
13
|
+
#
|
14
|
+
ASPECTS_CACHE = AspectsCache.new
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param [String|IO] file The KAF file/input to process.
|
18
|
+
# @param [String] aspects_path Path to the aspects.
|
19
|
+
# @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
|
20
|
+
# @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
|
21
|
+
# by default due to the performance overhead.
|
22
|
+
#
|
23
|
+
def initialize(file, aspects_path, timestamp = true, pretty = false)
|
11
24
|
@document = Oga.parse_xml(file)
|
12
25
|
@aspects_path = aspects_path
|
13
26
|
@timestamp = timestamp
|
27
|
+
@pretty = pretty
|
14
28
|
|
15
29
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
16
30
|
end
|
@@ -20,64 +34,52 @@ module Opener
|
|
20
34
|
# @return [String]
|
21
35
|
#
|
22
36
|
def process
|
23
|
-
@language = get_language
|
24
|
-
@aspects = load_aspects
|
25
|
-
@terms = get_terms
|
26
|
-
|
27
37
|
existing_aspects = extract_aspects
|
28
38
|
|
29
39
|
add_features_layer
|
30
40
|
add_properties_layer
|
31
41
|
|
32
|
-
|
42
|
+
existing_aspects.each_with_index do |(key, value), index|
|
43
|
+
index += 1
|
33
44
|
|
34
|
-
existing_aspects.each_pair do |key,value|
|
35
45
|
add_property(key, value, index)
|
36
|
-
index += 1
|
37
46
|
end
|
38
47
|
|
39
48
|
add_linguistic_processor
|
40
49
|
|
41
|
-
return pretty_print(document)
|
50
|
+
return pretty ? pretty_print(document) : document.to_xml
|
42
51
|
end
|
43
52
|
|
44
53
|
##
|
45
|
-
# Loads the aspects from the txt file
|
46
54
|
# @return [Hash]
|
47
55
|
#
|
48
|
-
def
|
49
|
-
|
50
|
-
|
51
|
-
File.foreach(aspects_file) do |line|
|
52
|
-
lemma, pos, aspect = line.gsub("\n", "").split("\t")
|
53
|
-
|
54
|
-
aspects_hash[lemma.to_sym] = [] unless aspects_hash[lemma.to_sym]
|
55
|
-
aspects_hash[lemma.to_sym] << aspect
|
56
|
-
end
|
57
|
-
|
58
|
-
return aspects_hash
|
56
|
+
def aspects
|
57
|
+
return ASPECTS_CACHE[aspects_file]
|
59
58
|
end
|
60
59
|
|
61
60
|
##
|
62
61
|
# Get the language of the input file.
|
62
|
+
#
|
63
63
|
# @return [String]
|
64
64
|
#
|
65
|
-
def
|
66
|
-
document.at_xpath('KAF').get('xml:lang')
|
65
|
+
def language
|
66
|
+
return @language ||= document.at_xpath('KAF').get('xml:lang')
|
67
67
|
end
|
68
68
|
|
69
69
|
##
|
70
70
|
# Get the terms from the input file
|
71
71
|
# @return [Hash]
|
72
72
|
#
|
73
|
-
def
|
74
|
-
|
73
|
+
def terms
|
74
|
+
unless @terms
|
75
|
+
@terms = {}
|
75
76
|
|
76
|
-
|
77
|
-
|
77
|
+
document.xpath('KAF/terms/term').each do |term|
|
78
|
+
@terms[term.get('tid').to_sym] = term.get('lemma')
|
79
|
+
end
|
78
80
|
end
|
79
81
|
|
80
|
-
return
|
82
|
+
return @terms
|
81
83
|
end
|
82
84
|
|
83
85
|
##
|
@@ -93,7 +95,7 @@ module Opener
|
|
93
95
|
# lemmas) belong to a property.
|
94
96
|
max_ngram = 2
|
95
97
|
|
96
|
-
uniq_aspects = {}
|
98
|
+
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
97
99
|
|
98
100
|
while current_token < terms.count
|
99
101
|
(0..max_ngram).each do |tam_ngram|
|
@@ -107,7 +109,6 @@ module Opener
|
|
107
109
|
properties.uniq.each do |property|
|
108
110
|
next if !property or property.strip.empty?
|
109
111
|
|
110
|
-
uniq_aspects[property.to_sym] = [] unless uniq_aspects[property.to_sym]
|
111
112
|
uniq_aspects[property.to_sym] << [ids,ngram]
|
112
113
|
end
|
113
114
|
end
|
@@ -223,8 +224,12 @@ module Opener
|
|
223
224
|
return !!document.at_xpath('KAF')
|
224
225
|
end
|
225
226
|
|
227
|
+
##
|
228
|
+
# @return [String]
|
229
|
+
#
|
226
230
|
def aspects_file
|
227
|
-
return
|
231
|
+
return @aspects_file ||=
|
232
|
+
File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
|
228
233
|
end
|
229
234
|
end # Processor
|
230
235
|
end # PropertyTagger
|
data/task/test.rake
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: benchmark-ips
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.0'
|
111
125
|
description: Property tagger for hotels in Dutch and English.
|
112
126
|
email:
|
113
127
|
executables:
|
@@ -125,6 +139,7 @@ files:
|
|
125
139
|
- config.ru
|
126
140
|
- exec/property-tagger.rb
|
127
141
|
- lib/opener/property_tagger.rb
|
142
|
+
- lib/opener/property_tagger/aspects_cache.rb
|
128
143
|
- lib/opener/property_tagger/cli.rb
|
129
144
|
- lib/opener/property_tagger/processor.rb
|
130
145
|
- lib/opener/property_tagger/public/markdown.css
|