opener-property-tagger 3.0.5 → 3.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/opener/property_tagger.rb +7 -14
- data/lib/opener/property_tagger/aspects_cache.rb +49 -0
- data/lib/opener/property_tagger/cli.rb +4 -1
- data/lib/opener/property_tagger/processor.rb +38 -33
- data/lib/opener/property_tagger/version.rb +1 -1
- data/opener-property-tagger.gemspec +1 -0
- data/task/test.rake +2 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c3d36e45bbff187579cbb8f2275d53261ed13030
|
4
|
+
data.tar.gz: 2850b27e9876336083dde1af17083658be60ef1c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f95a22d030ff6dc1685ee117ddae2f0d104fce5b187e7bb2062b96dd98ba035dd55c6b9f21266a63e968ff42d9e0869ce762786c7a2d7f5b53b6cc63d5aff4a2
|
7
|
+
data.tar.gz: be9bfed3bb90e26a111e03973a3ec543780e886be08d7102dad057e81917e1323c599198f81d79da58257d85e5adafaf507454a0fa6e30105fc8210698c94e17
|
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'open3'
|
2
2
|
require 'slop'
|
3
3
|
require 'oga'
|
4
|
+
require 'monitor'
|
4
5
|
|
5
6
|
require 'rexml/document'
|
6
7
|
require 'rexml/formatters/pretty'
|
7
8
|
|
8
9
|
require_relative 'property_tagger/version'
|
9
10
|
require_relative 'property_tagger/cli'
|
11
|
+
require_relative 'property_tagger/aspects_cache'
|
10
12
|
require_relative 'property_tagger/processor'
|
11
13
|
|
12
14
|
module Opener
|
@@ -52,24 +54,15 @@ module Opener
|
|
52
54
|
end
|
53
55
|
|
54
56
|
##
|
55
|
-
# Processes the input
|
56
|
-
# STDERR and an object containing process information.
|
57
|
+
# Processes the input KAF document.
|
57
58
|
#
|
58
|
-
# @param [String] input
|
59
|
-
# @return [
|
59
|
+
# @param [String] input
|
60
|
+
# @return [String]
|
60
61
|
#
|
61
62
|
def run(input)
|
62
|
-
|
63
|
-
|
64
|
-
return output
|
65
|
-
end
|
66
|
-
|
67
|
-
protected
|
68
|
-
|
69
|
-
def process(input)
|
70
|
-
processor = Processor.new(input, path, !options[:no_time])
|
63
|
+
timestamp = !options[:no_time]
|
71
64
|
|
72
|
-
return
|
65
|
+
return Processor.new(input, path, timestamp, options[:pretty]).process
|
73
66
|
end
|
74
67
|
end # PropertyTagger
|
75
68
|
end # Opener
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Opener
|
2
|
+
class PropertyTagger
|
3
|
+
##
|
4
|
+
# Thread-safe cache for storing the contents of aspect files.
|
5
|
+
#
|
6
|
+
class AspectsCache
|
7
|
+
include MonitorMixin
|
8
|
+
|
9
|
+
def initialize
|
10
|
+
super
|
11
|
+
|
12
|
+
@cache = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Returns the aspects for the given file path. If the aspects don't exist
|
17
|
+
# they are first loaded into the cache.
|
18
|
+
#
|
19
|
+
# @param [String] path
|
20
|
+
#
|
21
|
+
def [](path)
|
22
|
+
synchronize do
|
23
|
+
@cache[path] = load_aspects(path) unless @cache.key?(path)
|
24
|
+
end
|
25
|
+
|
26
|
+
return @cache[path]
|
27
|
+
end
|
28
|
+
|
29
|
+
alias_method :get, :[]
|
30
|
+
|
31
|
+
##
|
32
|
+
# Loads the aspects of the given path.
|
33
|
+
#
|
34
|
+
# @param [String] path
|
35
|
+
#
|
36
|
+
def load_aspects(path)
|
37
|
+
mapping = Hash.new { |hash, key| hash[key] = [] }
|
38
|
+
|
39
|
+
File.foreach(path) do |line|
|
40
|
+
lemma, pos, aspect = line.chomp.split("\t")
|
41
|
+
|
42
|
+
mapping[lemma.to_sym] << aspect
|
43
|
+
end
|
44
|
+
|
45
|
+
return mapping
|
46
|
+
end
|
47
|
+
end # AspectsCache
|
48
|
+
end # PropertyTagger
|
49
|
+
end # Opener
|
@@ -56,10 +56,13 @@ Examples:
|
|
56
56
|
|
57
57
|
on :'no-time', 'Disables adding of timestamps'
|
58
58
|
|
59
|
+
on :ugly, 'Disables pretty formatting of XML (faster)'
|
60
|
+
|
59
61
|
run do |opts, args|
|
60
62
|
tagger = PropertyTagger.new(
|
61
63
|
:args => args,
|
62
|
-
:no_time => opts[:'no-time']
|
64
|
+
:no_time => opts[:'no-time'],
|
65
|
+
:pretty => !opts[:ugly]
|
63
66
|
)
|
64
67
|
|
65
68
|
input = STDIN.tty? ? nil : STDIN.read
|
@@ -4,13 +4,27 @@ module Opener
|
|
4
4
|
# Class that applies property tagging to a given input KAF file.
|
5
5
|
#
|
6
6
|
class Processor
|
7
|
-
attr_accessor :document, :aspects_path, :
|
8
|
-
:timestamp
|
7
|
+
attr_accessor :document, :aspects_path, :timestamp, :pretty
|
9
8
|
|
10
|
-
|
9
|
+
##
|
10
|
+
# Global cache used for storing loaded aspects.
|
11
|
+
#
|
12
|
+
# @return [Opener::PropertyTagger::AspectsCache]
|
13
|
+
#
|
14
|
+
ASPECTS_CACHE = AspectsCache.new
|
15
|
+
|
16
|
+
##
|
17
|
+
# @param [String|IO] file The KAF file/input to process.
|
18
|
+
# @param [String] aspects_path Path to the aspects.
|
19
|
+
# @param [TrueClass|FalseClass] timestamp Add timestamps to the KAF.
|
20
|
+
# @param [TrueClass|FalseClass] pretty Enable pretty formatting, disabled
|
21
|
+
# by default due to the performance overhead.
|
22
|
+
#
|
23
|
+
def initialize(file, aspects_path, timestamp = true, pretty = false)
|
11
24
|
@document = Oga.parse_xml(file)
|
12
25
|
@aspects_path = aspects_path
|
13
26
|
@timestamp = timestamp
|
27
|
+
@pretty = pretty
|
14
28
|
|
15
29
|
raise 'Error parsing input. Input is required to be KAF' unless is_kaf?
|
16
30
|
end
|
@@ -20,64 +34,52 @@ module Opener
|
|
20
34
|
# @return [String]
|
21
35
|
#
|
22
36
|
def process
|
23
|
-
@language = get_language
|
24
|
-
@aspects = load_aspects
|
25
|
-
@terms = get_terms
|
26
|
-
|
27
37
|
existing_aspects = extract_aspects
|
28
38
|
|
29
39
|
add_features_layer
|
30
40
|
add_properties_layer
|
31
41
|
|
32
|
-
|
42
|
+
existing_aspects.each_with_index do |(key, value), index|
|
43
|
+
index += 1
|
33
44
|
|
34
|
-
existing_aspects.each_pair do |key,value|
|
35
45
|
add_property(key, value, index)
|
36
|
-
index += 1
|
37
46
|
end
|
38
47
|
|
39
48
|
add_linguistic_processor
|
40
49
|
|
41
|
-
return pretty_print(document)
|
50
|
+
return pretty ? pretty_print(document) : document.to_xml
|
42
51
|
end
|
43
52
|
|
44
53
|
##
|
45
|
-
# Loads the aspects from the txt file
|
46
54
|
# @return [Hash]
|
47
55
|
#
|
48
|
-
def
|
49
|
-
|
50
|
-
|
51
|
-
File.foreach(aspects_file) do |line|
|
52
|
-
lemma, pos, aspect = line.gsub("\n", "").split("\t")
|
53
|
-
|
54
|
-
aspects_hash[lemma.to_sym] = [] unless aspects_hash[lemma.to_sym]
|
55
|
-
aspects_hash[lemma.to_sym] << aspect
|
56
|
-
end
|
57
|
-
|
58
|
-
return aspects_hash
|
56
|
+
def aspects
|
57
|
+
return ASPECTS_CACHE[aspects_file]
|
59
58
|
end
|
60
59
|
|
61
60
|
##
|
62
61
|
# Get the language of the input file.
|
62
|
+
#
|
63
63
|
# @return [String]
|
64
64
|
#
|
65
|
-
def
|
66
|
-
document.at_xpath('KAF').get('xml:lang')
|
65
|
+
def language
|
66
|
+
return @language ||= document.at_xpath('KAF').get('xml:lang')
|
67
67
|
end
|
68
68
|
|
69
69
|
##
|
70
70
|
# Get the terms from the input file
|
71
71
|
# @return [Hash]
|
72
72
|
#
|
73
|
-
def
|
74
|
-
|
73
|
+
def terms
|
74
|
+
unless @terms
|
75
|
+
@terms = {}
|
75
76
|
|
76
|
-
|
77
|
-
|
77
|
+
document.xpath('KAF/terms/term').each do |term|
|
78
|
+
@terms[term.get('tid').to_sym] = term.get('lemma')
|
79
|
+
end
|
78
80
|
end
|
79
81
|
|
80
|
-
return
|
82
|
+
return @terms
|
81
83
|
end
|
82
84
|
|
83
85
|
##
|
@@ -93,7 +95,7 @@ module Opener
|
|
93
95
|
# lemmas) belong to a property.
|
94
96
|
max_ngram = 2
|
95
97
|
|
96
|
-
uniq_aspects = {}
|
98
|
+
uniq_aspects = Hash.new { |hash, key| hash[key] = [] }
|
97
99
|
|
98
100
|
while current_token < terms.count
|
99
101
|
(0..max_ngram).each do |tam_ngram|
|
@@ -107,7 +109,6 @@ module Opener
|
|
107
109
|
properties.uniq.each do |property|
|
108
110
|
next if !property or property.strip.empty?
|
109
111
|
|
110
|
-
uniq_aspects[property.to_sym] = [] unless uniq_aspects[property.to_sym]
|
111
112
|
uniq_aspects[property.to_sym] << [ids,ngram]
|
112
113
|
end
|
113
114
|
end
|
@@ -223,8 +224,12 @@ module Opener
|
|
223
224
|
return !!document.at_xpath('KAF')
|
224
225
|
end
|
225
226
|
|
227
|
+
##
|
228
|
+
# @return [String]
|
229
|
+
#
|
226
230
|
def aspects_file
|
227
|
-
return
|
231
|
+
return @aspects_file ||=
|
232
|
+
File.expand_path("#{aspects_path}/#{language}.txt", __FILE__)
|
228
233
|
end
|
229
234
|
end # Processor
|
230
235
|
end # PropertyTagger
|
data/task/test.rake
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-property-tagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.
|
4
|
+
version: 3.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-02-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opener-daemons
|
@@ -108,6 +108,20 @@ dependencies:
|
|
108
108
|
- - ">="
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: benchmark-ips
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.0'
|
111
125
|
description: Property tagger for hotels in Dutch and English.
|
112
126
|
email:
|
113
127
|
executables:
|
@@ -125,6 +139,7 @@ files:
|
|
125
139
|
- config.ru
|
126
140
|
- exec/property-tagger.rb
|
127
141
|
- lib/opener/property_tagger.rb
|
142
|
+
- lib/opener/property_tagger/aspects_cache.rb
|
128
143
|
- lib/opener/property_tagger/cli.rb
|
129
144
|
- lib/opener/property_tagger/processor.rb
|
130
145
|
- lib/opener/property_tagger/public/markdown.css
|