pollex 0.0.3 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 3/6/2013 version 0.1.0
2
+ * New class: Pollex::Translator
3
+ * New methods:
4
+ * Entry#terms - parses the entry's description and returns an array
5
+ of English definitions for the entry
6
+ * Source#grammar - outputs information about parsing descriptions
7
+ of the source's entries
8
+ * SemanticField.find - filters SemanticField.all by name
9
+ * Including Pollex::Level
10
+ * Fixing Entry#inspect
11
+ * Error handling in PollexObject#inspect
12
+
1
13
  3/5/2013 version 0.0.3
2
14
  * Minor bug-fix, adding version and changelog
3
15
 
@@ -3,16 +3,24 @@ end
3
3
 
4
4
  require 'nokogiri'
5
5
  require 'lrucache'
6
+ require 'cld'
7
+ require 'json'
8
+
9
+ require 'singleton'
10
+ require 'open-uri'
11
+ require 'pp'
6
12
 
7
13
  [
8
14
  'version',
9
15
  'pollex_class',
10
16
  'scraper',
17
+ 'translator',
11
18
  'entry',
12
19
  'language',
13
20
  'reconstruction',
14
21
  'semantic_field',
15
- 'source'
22
+ 'source',
23
+ 'level'
16
24
  ].each do |file|
17
25
  require File.dirname(__FILE__) + "/pollex/#{file}.rb"
18
26
  end
@@ -3,7 +3,7 @@ module Pollex
3
3
  class Entry < PollexObject
4
4
  extend PollexClass
5
5
 
6
- attr_accessor :reflex, :description, :flag
6
+ attr_accessor :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
7
7
  attr_writer :reconstruction_name, :reconstruction_path
8
8
  attr_writer :language_name, :language_path
9
9
  attr_writer :source_code, :source_path
@@ -15,9 +15,40 @@ module Pollex
15
15
  @reconstruction_path
16
16
  end
17
17
 
18
+ # Processes the description of this entry and extracts a list of
19
+ # definitions, translated into English if necessary.
20
+ # @return [Array<String>] definitions corresponding to this entry
21
+ def terms
22
+ string = @description
23
+ grammar = description_grammar
24
+
25
+ # trim last part of description, if necessary
26
+ if grammar[:trim_after]
27
+ string = string.split(grammar[:trim_after])[0]
28
+ end
29
+
30
+ # split into terms, remove any unnecessary expressions
31
+ terms = string.split(grammar[:dividers])
32
+ .map {|t| t.sub(grammar[:trim_expressions], '')
33
+ .strip
34
+ .capitalize }
35
+ .select {|t| t.match(/\w/) }
36
+
37
+ # attempt to translate to English if necessary
38
+ if grammar[:language] != 'en'
39
+ terms.map! {|t| Translator.instance.translate(t, grammar[:language], terms) }
40
+ end
41
+
42
+ terms
43
+ end
44
+
18
45
  # @return [Language] the Language corresponding to this entry
19
46
  def language
20
- @language ||= Language.new(:name => @language_name, :path => @language_path)
47
+ if @language_path
48
+ @language ||= Language.new(:name => @language_name, :path => @language_path)
49
+ else
50
+ nil
51
+ end
21
52
  end
22
53
 
23
54
  # @return [(Source, nil)] the Source corresponding to this entry, if given
@@ -54,5 +85,17 @@ module Pollex
54
85
  [:flag, "td[3]/span[@class='flag']/text()"]
55
86
  ])
56
87
  end
88
+
89
+ private
90
+
91
+ # @return [Hash] grammatical information pertaining to the description of this
92
+ # entry, used by Entry#terms
93
+ def description_grammar
94
+ if source
95
+ source.grammar
96
+ else
97
+ Source.new.grammar
98
+ end
99
+ end
57
100
  end
58
101
  end
@@ -19,7 +19,7 @@ module Pollex
19
19
  def inspect
20
20
  inspectables = self.class.inspectables
21
21
  if inspectables
22
- "#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i)}\""}.join(' ') + ">"
22
+ "#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i) rescue nil}\""}.join(' ') + ">"
23
23
  else
24
24
  super
25
25
  end
@@ -1,6 +1,3 @@
1
- require 'singleton'
2
- require 'open-uri'
3
-
4
1
  module Pollex
5
2
  # Singleton object for scraping Pollex, caching the results, and extracting data.
6
3
  class Scraper
@@ -28,10 +28,19 @@ module Pollex
28
28
  ])
29
29
  end
30
30
 
31
- # Counts the number of SemanticField within Pollex
32
- # @return [Integer] number of SemanticField in Pollex
31
+ # Counts the number of SemanticFields within Pollex
32
+ # @return [Integer] number of SemanticFields in Pollex
33
33
  def self.count
34
34
  self.all.count
35
35
  end
36
+
37
+ # Looks up all SemanticFields matching a given name.
38
+ # @note Pollex has no built-in search for SemanticFields, so this method is
39
+ # simply a filter over SemanticField.all.
40
+ # @param name [String] term to search for
41
+ # @return [Array<SemanticField>] array of SemanticFields matching the search term
42
+ def self.find(name)
43
+ self.all.select { |sf| sf.name.downcase.include?(name.downcase) }
44
+ end
36
45
  end
37
46
  end
@@ -13,6 +13,8 @@ module Pollex
13
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
14
14
  [:language_name, 'td[1]/a/text()'],
15
15
  [:language_path, 'td[1]/a/@href'],
16
+ [:source_code, nil, lambda {|x| @code}],
17
+ [:source_path, nil, lambda {|x| @path}],
16
18
  [:reflex, 'td[2]/text()'],
17
19
  [:description, 'td[3]/text()'],
18
20
  [:flag, "td[3]/span[@class='flag']/text()"]
@@ -38,6 +40,76 @@ module Pollex
38
40
  @count ||= @entries.count
39
41
  end
40
42
 
43
+ # Returns grammatical information for this source, used for
44
+ # intelligently parsing the descriptions of entries from this source
45
+ # @note Information is currently entered for all sources on
46
+ # http://pollex.org.nz/source/ up to (and including)
47
+ # Bse
48
+ # @return [Hash] grammatical information pertaining to the descriptions
49
+ # of this source's entries
50
+ # @see Entry#terms
51
+ def grammar
52
+ # first, assume reasonable defaults
53
+
54
+ language = 'en' # default language: English
55
+ dividers = /[,;]/ # default: split on comma and semicolon
56
+ trim_expressions = '' # default: don't trim any expressions
57
+ trim_after = nil # default: don't trim any trailing text
58
+
59
+ # now bring in source-specific information
60
+
61
+ if ['Cnt', 'Bxn'].include? @code
62
+ # Spanish-language sources
63
+ language = 'es'
64
+ elsif ['Aca', 'Bgn', 'Btn', 'Hmn', 'Rch'].include? @code
65
+ # French-language sources
66
+ language = 'fr'
67
+ end
68
+
69
+ if ['Aca', 'Bxn'].include? @code
70
+ # split by comma, semicolon, period
71
+ dividers = /(,|;|\. )/
72
+ elsif ['Atn', 'Bwh', 'Hmn'].include? @code
73
+ # don't split at all
74
+ dividers = '\n' # dividers = nil doesn't work
75
+ elsif ['Bgn', 'Bst', 'Brn'].include? @code
76
+ # split by period
77
+ dividers = '.'
78
+ elsif ['Bkr', 'Bgs'].include? @code
79
+ # split by comma, period
80
+ dividers = /(,|\. )/
81
+ elsif ['Bge', 'Bck'].include? @code
82
+ # split by semicolon
83
+ dividers = ';'
84
+ end
85
+
86
+ if ['McP', 'Dsn'].include? @code
87
+ # Trim all (parenthetical expressions)
88
+ trim_expressions = /\(.*\)/
89
+ elsif ['Cnt', 'Aca', 'Bse', 'Hmn'].include? @code
90
+ # Trim parenthetical expressions that are <= 4 chars or contain numbers
91
+ trim_expressions = /\((.{0,4}|.*[0-9].*)\)/
92
+ elsif ['Stz', 'Bck'].include? @code
93
+ # Trim parenthetical expressions that contain numbers
94
+ trim_expressions = /\(.*[0-9].*\)/
95
+ elsif ['Rsr'].include? @code
96
+ # Trim all "expressions in quotes"
97
+ trim_expressions = /".*"/
98
+ end
99
+
100
+ if ['Btl', 'Bck'].include? @code
101
+ # Trim everything after a period
102
+ trim_after = '.'
103
+ end
104
+
105
+ {
106
+ :language => language,
107
+ :dividers => dividers,
108
+ :trim_expressions => trim_expressions,
109
+ :trim_after => trim_after
110
+ }
111
+ end
112
+
41
113
  # Returns all Sources in Pollex.
42
114
  # @return [Array<Source>] array of Sources in Pollex
43
115
  def self.all
@@ -49,5 +121,11 @@ module Pollex
49
121
  [:reference, 'td[4]/text()']
50
122
  ])
51
123
  end
124
+
125
+ # Counts the number of Sources within Pollex
126
+ # @return [Integer] number of Sources in Pollex
127
+ def self.count
128
+ self.all.count
129
+ end
52
130
  end
53
131
  end
@@ -0,0 +1,49 @@
1
+ module Pollex
2
+ # Singleton object for translating descriptions into English.
3
+ class Translator
4
+ include Singleton
5
+
6
+ # Instantiates a cache of size 2500 for storing translations.
7
+ def initialize()
8
+ @cache = LRUCache.new(:max_size => 2500, :default => nil)
9
+ end
10
+
11
+ # Translates a phrase into English using the free MyMemory API, and caches
12
+ # the result.
13
+ # @note MyMemory currently has a limit of 100 API requests per IP per day for
14
+ # unregistered users.
15
+ # @param phrase [String] Phrase to be translated
16
+ # @param source_lang_code [String] Two-letter language code for the source language
17
+ # @param context [Array<String>] Adjoining phrases (optional)
18
+ # @return [String] Translated phrase
19
+ def translate(phrase, source_lang_code, context = nil)
20
+ context ||= [phrase]
21
+ if context.all? {|x| CLD.detect_language(x)[:code] == 'en'}
22
+ # we are reasonably sure that this phrase is already in English - no need to translate
23
+ phrase
24
+ else
25
+ # first, check the cache
26
+ key = [phrase, source_lang_code]
27
+ if @cache[key]
28
+ @cache[key]
29
+ else
30
+ # make a request to MyMemory
31
+ puts "Translating '#{phrase}' ..."
32
+ url = "http://mymemory.translated.net/api/get?q=#{URI::encode(phrase)}&langpair=#{source_lang_code}%7Cen"
33
+ results_json = open(url).read
34
+ result = JSON.parse(results_json)['responseData']['translatedText']
35
+
36
+ if result.include? 'MYMEMORY WARNING'
37
+ # translation failed - return original phrase
38
+ puts result
39
+ phrase
40
+ else
41
+ # translation succeeded - store into cache and return translated phrase
42
+ @cache[key] = result
43
+ result
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
@@ -1,3 +1,3 @@
1
1
  module Pollex
2
- VERSION = '0.0.3'
2
+ VERSION = '0.1.0'
3
3
  end
@@ -14,4 +14,6 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.add_dependency 'nokogiri'
16
16
  s.add_dependency 'lrucache'
17
+ s.add_dependency 'cld'
18
+ s.add_dependency 'json'
17
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pollex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-06 00:00:00.000000000 Z
12
+ date: 2013-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
43
43
  - - ! '>='
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: cld
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: json
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
46
78
  description: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
47
79
  email: alex.nisnevich@gmail.com
48
80
  executables: []
@@ -60,6 +92,7 @@ files:
60
92
  - lib/pollex/scraper.rb
61
93
  - lib/pollex/semantic_field.rb
62
94
  - lib/pollex/source.rb
95
+ - lib/pollex/translator.rb
63
96
  - lib/pollex/version.rb
64
97
  - pollex.gemspec
65
98
  homepage: http://github.com/AlexNisnevich/pollex
@@ -82,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
115
  version: '0'
83
116
  requirements: []
84
117
  rubyforge_project:
85
- rubygems_version: 1.8.25
118
+ rubygems_version: 1.8.23
86
119
  signing_key:
87
120
  specification_version: 3
88
121
  summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)