pollex 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/lib/pollex.rb +9 -1
- data/lib/pollex/entry.rb +45 -2
- data/lib/pollex/pollex_class.rb +1 -1
- data/lib/pollex/scraper.rb +0 -3
- data/lib/pollex/semantic_field.rb +11 -2
- data/lib/pollex/source.rb +78 -0
- data/lib/pollex/translator.rb +49 -0
- data/lib/pollex/version.rb +1 -1
- data/pollex.gemspec +2 -0
- metadata +36 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
3/6/2013 version 0.1.0
|
2
|
+
* New class: Pollex::Translator
|
3
|
+
* New methods:
|
4
|
+
* Entry.terms - parses the entry's description and returns an array
|
5
|
+
of English definitions for the entry
|
6
|
+
* Source.grammar - outputs information about parsing descriptions
|
7
|
+
of the source's entries
|
8
|
+
* SemanticField.find - filters SemanticField.all by name
|
9
|
+
* Including Pollex::Level
|
10
|
+
* Fixing Entry#inspect
|
11
|
+
* Error handling in PollexObject#inspect
|
12
|
+
|
1
13
|
3/5/2013 version 0.0.3
|
2
14
|
* Minor bug-fix, adding version and changelog
|
3
15
|
|
data/lib/pollex.rb
CHANGED
@@ -3,16 +3,24 @@ end
|
|
3
3
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'lrucache'
|
6
|
+
require 'cld'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
require 'singleton'
|
10
|
+
require 'open-uri'
|
11
|
+
require 'pp'
|
6
12
|
|
7
13
|
[
|
8
14
|
'version',
|
9
15
|
'pollex_class',
|
10
16
|
'scraper',
|
17
|
+
'translator',
|
11
18
|
'entry',
|
12
19
|
'language',
|
13
20
|
'reconstruction',
|
14
21
|
'semantic_field',
|
15
|
-
'source'
|
22
|
+
'source',
|
23
|
+
'level'
|
16
24
|
].each do |file|
|
17
25
|
require File.dirname(__FILE__) + "/pollex/#{file}.rb"
|
18
26
|
end
|
data/lib/pollex/entry.rb
CHANGED
@@ -3,7 +3,7 @@ module Pollex
|
|
3
3
|
class Entry < PollexObject
|
4
4
|
extend PollexClass
|
5
5
|
|
6
|
-
attr_accessor :reflex, :description, :flag
|
6
|
+
attr_accessor :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
|
7
7
|
attr_writer :reconstruction_name, :reconstruction_path
|
8
8
|
attr_writer :language_name, :language_path
|
9
9
|
attr_writer :source_code, :source_path
|
@@ -15,9 +15,40 @@ module Pollex
|
|
15
15
|
@reconstruction_path
|
16
16
|
end
|
17
17
|
|
18
|
+
# Processes the description of this entry and extracts a list of
|
19
|
+
# definitions, translated into English if necessary.
|
20
|
+
# @return [Array<String>] definitions corresponding to this entry
|
21
|
+
def terms
|
22
|
+
string = @description
|
23
|
+
grammar = description_grammar
|
24
|
+
|
25
|
+
# trim last part of description, if necessary
|
26
|
+
if grammar[:trim_after]
|
27
|
+
string = string.split(grammar[:trim_after])[0]
|
28
|
+
end
|
29
|
+
|
30
|
+
# split into terms, remove any unnecessary expressions
|
31
|
+
terms = string.split(grammar[:dividers])
|
32
|
+
.map {|t| t.sub(grammar[:trim_expressions], '')
|
33
|
+
.strip
|
34
|
+
.capitalize }
|
35
|
+
.select {|t| t.match(/\w/) }
|
36
|
+
|
37
|
+
# attempt to translate to English if necessary
|
38
|
+
if grammar[:language] != 'en'
|
39
|
+
terms.map! {|t| Translator.instance.translate(t, grammar[:language], terms) }
|
40
|
+
end
|
41
|
+
|
42
|
+
terms
|
43
|
+
end
|
44
|
+
|
18
45
|
# @return [Language] the Language corresponding to this entry
|
19
46
|
def language
|
20
|
-
|
47
|
+
if @language_path
|
48
|
+
@language ||= Language.new(:name => @language_name, :path => @language_path)
|
49
|
+
else
|
50
|
+
nil
|
51
|
+
end
|
21
52
|
end
|
22
53
|
|
23
54
|
# @return [(Source, nil)] the Source corresponding to this entry, if given
|
@@ -54,5 +85,17 @@ module Pollex
|
|
54
85
|
[:flag, "td[3]/span[@class='flag']/text()"]
|
55
86
|
])
|
56
87
|
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
# @return [Hash] grammatical information pertaining to the description of this
|
92
|
+
# entry, used by Entry#terms
|
93
|
+
def description_grammar
|
94
|
+
if source
|
95
|
+
source.grammar
|
96
|
+
else
|
97
|
+
Source.new.grammar
|
98
|
+
end
|
99
|
+
end
|
57
100
|
end
|
58
101
|
end
|
data/lib/pollex/pollex_class.rb
CHANGED
@@ -19,7 +19,7 @@ module Pollex
|
|
19
19
|
def inspect
|
20
20
|
inspectables = self.class.inspectables
|
21
21
|
if inspectables
|
22
|
-
"#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i)}\""}.join(' ') + ">"
|
22
|
+
"#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i) rescue nil}\""}.join(' ') + ">"
|
23
23
|
else
|
24
24
|
super
|
25
25
|
end
|
data/lib/pollex/scraper.rb
CHANGED
data/lib/pollex/semantic_field.rb
CHANGED
@@ -28,10 +28,19 @@ module Pollex
|
|
28
28
|
])
|
29
29
|
end
|
30
30
|
|
31
|
-
# Counts the number of
|
32
|
-
# @return [Integer] number of
|
31
|
+
# Counts the number of SemanticFields within Pollex
|
32
|
+
# @return [Integer] number of SemanticFields in Pollex
|
33
33
|
def self.count
|
34
34
|
self.all.count
|
35
35
|
end
|
36
|
+
|
37
|
+
# Looks up all SemanticFields matching a given name.
|
38
|
+
# @note Pollex has no built-in search for SemanticFields, so this method is
|
39
|
+
# simply a filter over SemanticField.all.
|
40
|
+
# @param name [String] term to search for
|
41
|
+
# @return [Array<SemanticField>] array of SemanticFields matching the search term
|
42
|
+
def self.find(name)
|
43
|
+
self.all.select { |sf| sf.name.downcase.include?(name.downcase) }
|
44
|
+
end
|
36
45
|
end
|
37
46
|
end
|
data/lib/pollex/source.rb
CHANGED
@@ -13,6 +13,8 @@ module Pollex
|
|
13
13
|
@entries ||= Scraper.instance.get_all(Entry, @path, [
|
14
14
|
[:language_name, 'td[1]/a/text()'],
|
15
15
|
[:language_path, 'td[1]/a/@href'],
|
16
|
+
[:source_code, nil, lambda {|x| @code}],
|
17
|
+
[:source_path, nil, lambda {|x| @path}],
|
16
18
|
[:reflex, 'td[2]/text()'],
|
17
19
|
[:description, 'td[3]/text()'],
|
18
20
|
[:flag, "td[3]/span[@class='flag']/text()"]
|
@@ -38,6 +40,76 @@ module Pollex
|
|
38
40
|
@count ||= @entries.count
|
39
41
|
end
|
40
42
|
|
43
|
+
# Returns grammatical information for this source, used for
|
44
|
+
# intelligently parsing the descriptions of entries from this source
|
45
|
+
# @note Information is currently entered for all sources on
|
46
|
+
# http://pollex.org.nz/source/ up to (and including)
|
47
|
+
# Bse
|
48
|
+
# @return [Hash] grammatical information pertaining to the descriptions
|
49
|
+
# of this source's entries
|
50
|
+
# @see Entry#terms
|
51
|
+
def grammar
|
52
|
+
# first, assume reasonable defaults
|
53
|
+
|
54
|
+
language = 'en' # default language: English
|
55
|
+
dividers = /[,;]/ # default: split on comma and semicolon
|
56
|
+
trim_expressions = '' # default: don't trim any expressions
|
57
|
+
trim_after = nil # default: don't trim any trailing text
|
58
|
+
|
59
|
+
# now bring in source-specific information
|
60
|
+
|
61
|
+
if ['Cnt', 'Bxn'].include? @code
|
62
|
+
# Spanish-language sources
|
63
|
+
language = 'es'
|
64
|
+
elsif ['Aca', 'Bgn', 'Btn', 'Hmn', 'Rch'].include? @code
|
65
|
+
# French-language sources
|
66
|
+
language = 'fr'
|
67
|
+
end
|
68
|
+
|
69
|
+
if ['Aca', 'Bxn'].include? @code
|
70
|
+
# split by comma, semicolon, period
|
71
|
+
dividers = /(,|;|\. )/
|
72
|
+
elsif ['Atn', 'Bwh', 'Hmn'].include? @code
|
73
|
+
# don't split at all
|
74
|
+
dividers = '\n' # dividers = nil doesn't work
|
75
|
+
elsif ['Bgn', 'Bst', 'Brn'].include? @code
|
76
|
+
# split by period
|
77
|
+
dividers = '.'
|
78
|
+
elsif ['Bkr', 'Bgs'].include? @code
|
79
|
+
# split by comma, period
|
80
|
+
dividers = /(,|\. )/
|
81
|
+
elsif ['Bge', 'Bck'].include? @code
|
82
|
+
# split by semicolon
|
83
|
+
dividers = ';'
|
84
|
+
end
|
85
|
+
|
86
|
+
if ['McP', 'Dsn'].include? @code
|
87
|
+
# Trim all (parenthetical expressions)
|
88
|
+
trim_expressions = /\(.*\)/
|
89
|
+
elsif ['Cnt', 'Aca', 'Bse', 'Hmn'].include? @code
|
90
|
+
# Trim parenthetical expressions that are <= 4 chars or contain numbers
|
91
|
+
trim_expressions = /\((.{0,4}|.*[0-9].*)\)/
|
92
|
+
elsif ['Stz', 'Bck'].include? @code
|
93
|
+
# Trim parenthetical expressions that contain numbers
|
94
|
+
trim_expressions = /\(.*[0-9].*\)/
|
95
|
+
elsif ['Rsr'].include? @code
|
96
|
+
# Trim all "expressions in quotes"
|
97
|
+
trim_expressions = /".*"/
|
98
|
+
end
|
99
|
+
|
100
|
+
if ['Btl', 'Bck'].include? @code
|
101
|
+
# Trim everything after a period
|
102
|
+
trim_after = '.'
|
103
|
+
end
|
104
|
+
|
105
|
+
{
|
106
|
+
:language => language,
|
107
|
+
:dividers => dividers,
|
108
|
+
:trim_expressions => trim_expressions,
|
109
|
+
:trim_after => trim_after
|
110
|
+
}
|
111
|
+
end
|
112
|
+
|
41
113
|
# Returns all Sources in Pollex.
|
42
114
|
# @return [Array<Source>] array of Sources in Pollex
|
43
115
|
def self.all
|
@@ -49,5 +121,11 @@ module Pollex
|
|
49
121
|
[:reference, 'td[4]/text()']
|
50
122
|
])
|
51
123
|
end
|
124
|
+
|
125
|
+
# Counts the number of Sources within Pollex
|
126
|
+
# @return [Integer] number of Sources in Pollex
|
127
|
+
def self.count
|
128
|
+
self.all.count
|
129
|
+
end
|
52
130
|
end
|
53
131
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Pollex
|
2
|
+
# Singleton object for translating descriptions into English.
|
3
|
+
class Translator
|
4
|
+
include Singleton
|
5
|
+
|
6
|
+
# Instantiates a cache of size 2500 for storing translations.
|
7
|
+
def initialize()
|
8
|
+
@cache = LRUCache.new(:max_size => 2500, :default => nil)
|
9
|
+
end
|
10
|
+
|
11
|
+
# Translates a phrase into English using the free MyMemory API, and caches
|
12
|
+
# the result.
|
13
|
+
# @note MyMemory currently has a limit of 100 API requests per IP per day for
|
14
|
+
# unregistered users.
|
15
|
+
# @param phrase [String] Phrase to be translated
|
16
|
+
# @param source_lang_code [String] Two-letter language code for the source language
|
17
|
+
# @param context [Array<String>] Adjoining phrases (optional)
|
18
|
+
# @return [String] Translated phrase
|
19
|
+
def translate(phrase, source_lang_code, context = nil)
|
20
|
+
context ||= [phrase]
|
21
|
+
if context.all? {|x| CLD.detect_language(x)[:code] == 'en'}
|
22
|
+
# we are reasonably sure that this phrase is already in English - no need to translate
|
23
|
+
phrase
|
24
|
+
else
|
25
|
+
# first, check the cache
|
26
|
+
key = [phrase, source_lang_code]
|
27
|
+
if @cache[key]
|
28
|
+
@cache[key]
|
29
|
+
else
|
30
|
+
# make a request to MyMemory
|
31
|
+
puts "Translating '#{phrase}' ..."
|
32
|
+
url = "http://mymemory.translated.net/api/get?q=#{URI::encode(phrase)}&langpair=#{source_lang_code}%7Cen"
|
33
|
+
results_json = open(url).read
|
34
|
+
result = JSON.parse(results_json)['responseData']['translatedText']
|
35
|
+
|
36
|
+
if result.include? 'MYMEMORY WARNING'
|
37
|
+
# translation failed - return original phrase
|
38
|
+
puts result
|
39
|
+
phrase
|
40
|
+
else
|
41
|
+
# translation succeeded - store into cache and return translated phrase
|
42
|
+
@cache[key] = result
|
43
|
+
result
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/pollex/version.rb
CHANGED
data/pollex.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pollex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -43,6 +43,38 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: cld
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: json
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
46
78
|
description: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
|
47
79
|
email: alex.nisnevich@gmail.com
|
48
80
|
executables: []
|
@@ -60,6 +92,7 @@ files:
|
|
60
92
|
- lib/pollex/scraper.rb
|
61
93
|
- lib/pollex/semantic_field.rb
|
62
94
|
- lib/pollex/source.rb
|
95
|
+
- lib/pollex/translator.rb
|
63
96
|
- lib/pollex/version.rb
|
64
97
|
- pollex.gemspec
|
65
98
|
homepage: http://github.com/AlexNisnevich/pollex
|
@@ -82,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
115
|
version: '0'
|
83
116
|
requirements: []
|
84
117
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.8.
|
118
|
+
rubygems_version: 1.8.23
|
86
119
|
signing_key:
|
87
120
|
specification_version: 3
|
88
121
|
summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
|