pollex 0.0.3 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/lib/pollex.rb +9 -1
- data/lib/pollex/entry.rb +45 -2
- data/lib/pollex/pollex_class.rb +1 -1
- data/lib/pollex/scraper.rb +0 -3
- data/lib/pollex/semantic_field.rb +11 -2
- data/lib/pollex/source.rb +78 -0
- data/lib/pollex/translator.rb +49 -0
- data/lib/pollex/version.rb +1 -1
- data/pollex.gemspec +2 -0
- metadata +36 -3
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
3/6/2013 version 0.1.0
|
2
|
+
* New class: Pollex::Translator
|
3
|
+
* New methods:
|
4
|
+
* Entry.terms - parses the entry's description and returns an array
|
5
|
+
of English definitions for the entry
|
6
|
+
* Source.grammar - outputs information about parsing descriptions
|
7
|
+
of the source's entries
|
8
|
+
* SemanticField.find - filters SemanticField.all by name
|
9
|
+
* Including Pollex::Level
|
10
|
+
* Fixing Entry#inspect
|
11
|
+
* Error handling in PollexObject#inspect
|
12
|
+
|
1
13
|
3/5/2013 version 0.0.3
|
2
14
|
* Minor bug-fix, adding version and changelog
|
3
15
|
|
data/lib/pollex.rb
CHANGED
@@ -3,16 +3,24 @@ end
|
|
3
3
|
|
4
4
|
require 'nokogiri'
|
5
5
|
require 'lrucache'
|
6
|
+
require 'cld'
|
7
|
+
require 'json'
|
8
|
+
|
9
|
+
require 'singleton'
|
10
|
+
require 'open-uri'
|
11
|
+
require 'pp'
|
6
12
|
|
7
13
|
[
|
8
14
|
'version',
|
9
15
|
'pollex_class',
|
10
16
|
'scraper',
|
17
|
+
'translator',
|
11
18
|
'entry',
|
12
19
|
'language',
|
13
20
|
'reconstruction',
|
14
21
|
'semantic_field',
|
15
|
-
'source'
|
22
|
+
'source',
|
23
|
+
'level'
|
16
24
|
].each do |file|
|
17
25
|
require File.dirname(__FILE__) + "/pollex/#{file}.rb"
|
18
26
|
end
|
data/lib/pollex/entry.rb
CHANGED
@@ -3,7 +3,7 @@ module Pollex
|
|
3
3
|
class Entry < PollexObject
|
4
4
|
extend PollexClass
|
5
5
|
|
6
|
-
attr_accessor :reflex, :description, :flag
|
6
|
+
attr_accessor :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
|
7
7
|
attr_writer :reconstruction_name, :reconstruction_path
|
8
8
|
attr_writer :language_name, :language_path
|
9
9
|
attr_writer :source_code, :source_path
|
@@ -15,9 +15,40 @@ module Pollex
|
|
15
15
|
@reconstruction_path
|
16
16
|
end
|
17
17
|
|
18
|
+
# Processes the description of this entry and extracts a list of
|
19
|
+
# definitions, translated into English if necessary.
|
20
|
+
# @return [Array<String>] definitions corresponding to this entry
|
21
|
+
def terms
|
22
|
+
string = @description
|
23
|
+
grammar = description_grammar
|
24
|
+
|
25
|
+
# trim last part of description, if necessary
|
26
|
+
if grammar[:trim_after]
|
27
|
+
string = string.split(grammar[:trim_after])[0]
|
28
|
+
end
|
29
|
+
|
30
|
+
# split into terms, remove any unnecessary expressions
|
31
|
+
terms = string.split(grammar[:dividers])
|
32
|
+
.map {|t| t.sub(grammar[:trim_expressions], '')
|
33
|
+
.strip
|
34
|
+
.capitalize }
|
35
|
+
.select {|t| t.match(/\w/) }
|
36
|
+
|
37
|
+
# attempt to translate to English if necessary
|
38
|
+
if grammar[:language] != 'en'
|
39
|
+
terms.map! {|t| Translator.instance.translate(t, grammar[:language], terms) }
|
40
|
+
end
|
41
|
+
|
42
|
+
terms
|
43
|
+
end
|
44
|
+
|
18
45
|
# @return [Language] the Language corresponding to this entry
|
19
46
|
def language
|
20
|
-
|
47
|
+
if @language_path
|
48
|
+
@language ||= Language.new(:name => @language_name, :path => @language_path)
|
49
|
+
else
|
50
|
+
nil
|
51
|
+
end
|
21
52
|
end
|
22
53
|
|
23
54
|
# @return [(Source, nil)] the Source corresponding to this entry, if given
|
@@ -54,5 +85,17 @@ module Pollex
|
|
54
85
|
[:flag, "td[3]/span[@class='flag']/text()"]
|
55
86
|
])
|
56
87
|
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
# @return [Hash] grammatical information pertaining to the description of this
|
92
|
+
# entry, used by Entry#terms
|
93
|
+
def description_grammar
|
94
|
+
if source
|
95
|
+
source.grammar
|
96
|
+
else
|
97
|
+
Source.new.grammar
|
98
|
+
end
|
99
|
+
end
|
57
100
|
end
|
58
101
|
end
|
data/lib/pollex/pollex_class.rb
CHANGED
@@ -19,7 +19,7 @@ module Pollex
|
|
19
19
|
def inspect
|
20
20
|
inspectables = self.class.inspectables
|
21
21
|
if inspectables
|
22
|
-
"#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i)}\""}.join(' ') + ">"
|
22
|
+
"#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i) rescue nil}\""}.join(' ') + ">"
|
23
23
|
else
|
24
24
|
super
|
25
25
|
end
|
data/lib/pollex/scraper.rb
CHANGED
data/lib/pollex/semantic_field.rb
CHANGED
@@ -28,10 +28,19 @@ module Pollex
|
|
28
28
|
])
|
29
29
|
end
|
30
30
|
|
31
|
-
# Counts the number of
|
32
|
-
# @return [Integer] number of
|
31
|
+
# Counts the number of SemanticFields within Pollex
|
32
|
+
# @return [Integer] number of SemanticFields in Pollex
|
33
33
|
def self.count
|
34
34
|
self.all.count
|
35
35
|
end
|
36
|
+
|
37
|
+
# Looks up all SemanticFields matching a given name.
|
38
|
+
# @note Pollex has no built-in search for SemanticFields, so this method is
|
39
|
+
# simply a filter over SemanticField.all.
|
40
|
+
# @param name [String] term to search for
|
41
|
+
# @return [Array<SemanticField>] array of SemanticFields matching the search term
|
42
|
+
def self.find(name)
|
43
|
+
self.all.select { |sf| sf.name.downcase.include?(name.downcase) }
|
44
|
+
end
|
36
45
|
end
|
37
46
|
end
|
data/lib/pollex/source.rb
CHANGED
@@ -13,6 +13,8 @@ module Pollex
|
|
13
13
|
@entries ||= Scraper.instance.get_all(Entry, @path, [
|
14
14
|
[:language_name, 'td[1]/a/text()'],
|
15
15
|
[:language_path, 'td[1]/a/@href'],
|
16
|
+
[:source_code, nil, lambda {|x| @code}],
|
17
|
+
[:source_path, nil, lambda {|x| @path}],
|
16
18
|
[:reflex, 'td[2]/text()'],
|
17
19
|
[:description, 'td[3]/text()'],
|
18
20
|
[:flag, "td[3]/span[@class='flag']/text()"]
|
@@ -38,6 +40,76 @@ module Pollex
|
|
38
40
|
@count ||= @entries.count
|
39
41
|
end
|
40
42
|
|
43
|
+
# Returns grammatical information for this source, used for
|
44
|
+
# intelligently parsing the descriptions of entries from this source
|
45
|
+
# @note Information is currently entered for all sources on
|
46
|
+
# http://pollex.org.nz/source/ up to (and including)
|
47
|
+
# Bse
|
48
|
+
# @return [Hash] grammatical information pertaining to the descriptions
|
49
|
+
# of this source's entries
|
50
|
+
# @see Entry#terms
|
51
|
+
def grammar
|
52
|
+
# first, assume reasonable defaults
|
53
|
+
|
54
|
+
language = 'en' # default language: English
|
55
|
+
dividers = /[,;]/ # default: split on comma and semicolon
|
56
|
+
trim_expressions = '' # default: don't trim any expressions
|
57
|
+
trim_after = nil # default: don't trim any trailing text
|
58
|
+
|
59
|
+
# now bring in source-specific information
|
60
|
+
|
61
|
+
if ['Cnt', 'Bxn'].include? @code
|
62
|
+
# Spanish-language sources
|
63
|
+
language = 'es'
|
64
|
+
elsif ['Aca', 'Bgn', 'Btn', 'Hmn', 'Rch'].include? @code
|
65
|
+
# French-language sources
|
66
|
+
language = 'fr'
|
67
|
+
end
|
68
|
+
|
69
|
+
if ['Aca', 'Bxn'].include? @code
|
70
|
+
# split by comma, semicolon, period
|
71
|
+
dividers = /(,|;|\. )/
|
72
|
+
elsif ['Atn', 'Bwh', 'Hmn'].include? @code
|
73
|
+
# don't split at all
|
74
|
+
dividers = '\n' # dividers = nil doesn't work
|
75
|
+
elsif ['Bgn', 'Bst', 'Brn'].include? @code
|
76
|
+
# split by period
|
77
|
+
dividers = '.'
|
78
|
+
elsif ['Bkr', 'Bgs'].include? @code
|
79
|
+
# split by comma, period
|
80
|
+
dividers = /(,|\. )/
|
81
|
+
elsif ['Bge', 'Bck'].include? @code
|
82
|
+
# split by semicolon
|
83
|
+
dividers = ';'
|
84
|
+
end
|
85
|
+
|
86
|
+
if ['McP', 'Dsn'].include? @code
|
87
|
+
# Trim all (parenthetical expressions)
|
88
|
+
trim_expressions = /\(.*\)/
|
89
|
+
elsif ['Cnt', 'Aca', 'Bse', 'Hmn'].include? @code
|
90
|
+
# Trim parenthetical expressions that are <= 4 chars or contain numbers
|
91
|
+
trim_expressions = /\((.{0,4}|.*[0-9].*)\)/
|
92
|
+
elsif ['Stz', 'Bck'].include? @code
|
93
|
+
# Trim parenthetical expressions that contain numbers
|
94
|
+
trim_expressions = /\(.*[0-9].*\)/
|
95
|
+
elsif ['Rsr'].include? @code
|
96
|
+
# Trim all "expressions in quotes"
|
97
|
+
trim_expressions = /".*"/
|
98
|
+
end
|
99
|
+
|
100
|
+
if ['Btl', 'Bck'].include? @code
|
101
|
+
# Trim everything after a period
|
102
|
+
trim_after = '.'
|
103
|
+
end
|
104
|
+
|
105
|
+
{
|
106
|
+
:language => language,
|
107
|
+
:dividers => dividers,
|
108
|
+
:trim_expressions => trim_expressions,
|
109
|
+
:trim_after => trim_after
|
110
|
+
}
|
111
|
+
end
|
112
|
+
|
41
113
|
# Returns all Sources in Pollex.
|
42
114
|
# @return [Array<Source>] array of Sources in Pollex
|
43
115
|
def self.all
|
@@ -49,5 +121,11 @@ module Pollex
|
|
49
121
|
[:reference, 'td[4]/text()']
|
50
122
|
])
|
51
123
|
end
|
124
|
+
|
125
|
+
# Counts the number of Sources within Pollex
|
126
|
+
# @return [Integer] number of Sources in Pollex
|
127
|
+
def self.count
|
128
|
+
self.all.count
|
129
|
+
end
|
52
130
|
end
|
53
131
|
end
|
data/lib/pollex/translator.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Pollex
|
2
|
+
# Singleton object for translating descriptions into English.
|
3
|
+
class Translator
|
4
|
+
include Singleton
|
5
|
+
|
6
|
+
# Instantiates a cache of size 2500 for storing translations.
|
7
|
+
def initialize()
|
8
|
+
@cache = LRUCache.new(:max_size => 2500, :default => nil)
|
9
|
+
end
|
10
|
+
|
11
|
+
# Translates a phrase into English using the free MyMemory API, and caches
|
12
|
+
# the result.
|
13
|
+
# @note MyMemory currently has a limit of 100 API requests per IP per day for
|
14
|
+
# unregistered users.
|
15
|
+
# @param phrase [String] Phrase to be translated
|
16
|
+
# @param source_lang_code [String] Two-letter language code for the source language
|
17
|
+
# @param context [Array<String>] Adjoining phrases (optional)
|
18
|
+
# @return [String] Translated phrase
|
19
|
+
def translate(phrase, source_lang_code, context = nil)
|
20
|
+
context ||= [phrase]
|
21
|
+
if context.all? {|x| CLD.detect_language(x)[:code] == 'en'}
|
22
|
+
# we are reasonably sure that this phrase is already in English - no need to translate
|
23
|
+
phrase
|
24
|
+
else
|
25
|
+
# first, check the cache
|
26
|
+
key = [phrase, source_lang_code]
|
27
|
+
if @cache[key]
|
28
|
+
@cache[key]
|
29
|
+
else
|
30
|
+
# make a request to MyMemory
|
31
|
+
puts "Translating '#{phrase}' ..."
|
32
|
+
url = "http://mymemory.translated.net/api/get?q=#{URI::encode(phrase)}&langpair=#{source_lang_code}%7Cen"
|
33
|
+
results_json = open(url).read
|
34
|
+
result = JSON.parse(results_json)['responseData']['translatedText']
|
35
|
+
|
36
|
+
if result.include? 'MYMEMORY WARNING'
|
37
|
+
# translation failed - return original phrase
|
38
|
+
puts result
|
39
|
+
phrase
|
40
|
+
else
|
41
|
+
# translation succeeded - store into cache and return translated phrase
|
42
|
+
@cache[key] = result
|
43
|
+
result
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/pollex/version.rb
CHANGED
data/pollex.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pollex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-03-
|
12
|
+
date: 2013-03-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -43,6 +43,38 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: cld
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: json
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
46
78
|
description: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
|
47
79
|
email: alex.nisnevich@gmail.com
|
48
80
|
executables: []
|
@@ -60,6 +92,7 @@ files:
|
|
60
92
|
- lib/pollex/scraper.rb
|
61
93
|
- lib/pollex/semantic_field.rb
|
62
94
|
- lib/pollex/source.rb
|
95
|
+
- lib/pollex/translator.rb
|
63
96
|
- lib/pollex/version.rb
|
64
97
|
- pollex.gemspec
|
65
98
|
homepage: http://github.com/AlexNisnevich/pollex
|
@@ -82,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
115
|
version: '0'
|
83
116
|
requirements: []
|
84
117
|
rubyforge_project:
|
85
|
-
rubygems_version: 1.8.
|
118
|
+
rubygems_version: 1.8.23
|
86
119
|
signing_key:
|
87
120
|
specification_version: 3
|
88
121
|
summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
|