pollex 0.0.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,15 @@
1
+ 3/6/2013 version 0.1.0
2
+ * New class: Pollex::Translator
3
+ * New methods:
4
+ * Entry.terms - parses the entry's description and returns an array
5
+ of English definitions for the entry
6
+ * Source.grammar - outputs information about parsing descriptions
7
+ of the source's entries
8
+ * SemanticField.find - filters SemanticField.all by name
9
+ * Including Pollex::Level
10
+ * Fixing Entry#inspect
11
+ * Error handling in PollexObject#inspect
12
+
1
13
  3/5/2013 version 0.0.3
2
14
  * Minor bug-fix, adding version and changelog
3
15
 
@@ -3,16 +3,24 @@ end
3
3
 
4
4
  require 'nokogiri'
5
5
  require 'lrucache'
6
+ require 'cld'
7
+ require 'json'
8
+
9
+ require 'singleton'
10
+ require 'open-uri'
11
+ require 'pp'
6
12
 
7
13
  [
8
14
  'version',
9
15
  'pollex_class',
10
16
  'scraper',
17
+ 'translator',
11
18
  'entry',
12
19
  'language',
13
20
  'reconstruction',
14
21
  'semantic_field',
15
- 'source'
22
+ 'source',
23
+ 'level'
16
24
  ].each do |file|
17
25
  require File.dirname(__FILE__) + "/pollex/#{file}.rb"
18
26
  end
@@ -3,7 +3,7 @@ module Pollex
3
3
  class Entry < PollexObject
4
4
  extend PollexClass
5
5
 
6
- attr_accessor :reflex, :description, :flag
6
+ attr_accessor :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
7
7
  attr_writer :reconstruction_name, :reconstruction_path
8
8
  attr_writer :language_name, :language_path
9
9
  attr_writer :source_code, :source_path
@@ -15,9 +15,40 @@ module Pollex
15
15
  @reconstruction_path
16
16
  end
17
17
 
18
+ # Processes the description of this entry and extracts a list of
19
+ # definitions, translated into English if necessary.
20
+ # @return [Array<String>] definitions corresponding to this entry
21
+ def terms
22
+ string = @description
23
+ grammar = description_grammar
24
+
25
+ # trim last part of description, if necessary
26
+ if grammar[:trim_after]
27
+ string = string.split(grammar[:trim_after])[0]
28
+ end
29
+
30
+ # split into terms, remove any unnecessary expressions
31
+ terms = string.split(grammar[:dividers])
32
+ .map {|t| t.sub(grammar[:trim_expressions], '')
33
+ .strip
34
+ .capitalize }
35
+ .select {|t| t.match(/\w/) }
36
+
37
+ # attempt to translate to English if necessary
38
+ if grammar[:language] != 'en'
39
+ terms.map! {|t| Translator.instance.translate(t, grammar[:language], terms) }
40
+ end
41
+
42
+ terms
43
+ end
44
+
18
45
  # @return [Language] the Language corresponding to this entry
19
46
  def language
20
- @language ||= Language.new(:name => @language_name, :path => @language_path)
47
+ if @language_path
48
+ @language ||= Language.new(:name => @language_name, :path => @language_path)
49
+ else
50
+ nil
51
+ end
21
52
  end
22
53
 
23
54
  # @return [(Source, nil)] the Source corresponding to this entry, if given
@@ -54,5 +85,17 @@ module Pollex
54
85
  [:flag, "td[3]/span[@class='flag']/text()"]
55
86
  ])
56
87
  end
88
+
89
+ private
90
+
91
+ # @return [Hash] grammatical information pertaining to the description of this
92
+ # entry, used by Entry#terms
93
+ def description_grammar
94
+ if source
95
+ source.grammar
96
+ else
97
+ Source.new.grammar
98
+ end
99
+ end
57
100
  end
58
101
  end
@@ -19,7 +19,7 @@ module Pollex
19
19
  def inspect
20
20
  inspectables = self.class.inspectables
21
21
  if inspectables
22
- "#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i)}\""}.join(' ') + ">"
22
+ "#<#{self.class}:0x#{object_id.to_s(16)} " + inspectables.map {|i| "@#{i}=\"#{send(i) rescue nil}\""}.join(' ') + ">"
23
23
  else
24
24
  super
25
25
  end
@@ -1,6 +1,3 @@
1
- require 'singleton'
2
- require 'open-uri'
3
-
4
1
  module Pollex
5
2
  # Singleton object for scraping Pollex, caching the results, and extracting data.
6
3
  class Scraper
@@ -28,10 +28,19 @@ module Pollex
28
28
  ])
29
29
  end
30
30
 
31
- # Counts the number of SemanticField within Pollex
32
- # @return [Integer] number of SemanticField in Pollex
31
+ # Counts the number of SemanticFields within Pollex
32
+ # @return [Integer] number of SemanticFields in Pollex
33
33
  def self.count
34
34
  self.all.count
35
35
  end
36
+
37
+ # Looks up all SemanticFields matching a given name.
38
+ # @note Pollex has no built-in search for SemanticFields, so this method is
39
+ # simply a filter over SemanticField.all.
40
+ # @param name [String] term to search for
41
+ # @return [Array<SemanticField>] array of SemanticFields matching the search term
42
+ def self.find(name)
43
+ self.all.select { |sf| sf.name.downcase.include?(name.downcase) }
44
+ end
36
45
  end
37
46
  end
@@ -13,6 +13,8 @@ module Pollex
13
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
14
14
  [:language_name, 'td[1]/a/text()'],
15
15
  [:language_path, 'td[1]/a/@href'],
16
+ [:source_code, nil, lambda {|x| @code}],
17
+ [:source_path, nil, lambda {|x| @path}],
16
18
  [:reflex, 'td[2]/text()'],
17
19
  [:description, 'td[3]/text()'],
18
20
  [:flag, "td[3]/span[@class='flag']/text()"]
@@ -38,6 +40,76 @@ module Pollex
38
40
  @count ||= @entries.count
39
41
  end
40
42
 
43
+ # Returns grammatical information for this source, used for
44
+ # intelligently parsing the descriptions of entries from this source
45
+ # @note Information is currently entered for all sources on
46
+ # http://pollex.org.nz/source/ up to (and including)
47
+ # Bse
48
+ # @return [Hash] grammatical information pertaining to the descriptions
49
+ # of this source's entries
50
+ # @see Entry#terms
51
+ def grammar
52
+ # first, assume reasonable defaults
53
+
54
+ language = 'en' # default language: English
55
+ dividers = /[,;]/ # default: split on comma and semicolon
56
+ trim_expressions = '' # default: don't trim any expressions
57
+ trim_after = nil # default: don't trim any trailing text
58
+
59
+ # now bring in source-specific information
60
+
61
+ if ['Cnt', 'Bxn'].include? @code
62
+ # Spanish-language sources
63
+ language = 'es'
64
+ elsif ['Aca', 'Bgn', 'Btn', 'Hmn', 'Rch'].include? @code
65
+ # French-language sources
66
+ language = 'fr'
67
+ end
68
+
69
+ if ['Aca', 'Bxn'].include? @code
70
+ # split by comma, semicolon, period
71
+ dividers = /(,|;|\. )/
72
+ elsif ['Atn', 'Bwh', 'Hmn'].include? @code
73
+ # don't split at all
74
+ dividers = '\n' # dividers = nil doesn't work
75
+ elsif ['Bgn', 'Bst', 'Brn'].include? @code
76
+ # split by period
77
+ dividers = '.'
78
+ elsif ['Bkr', 'Bgs'].include? @code
79
+ # split by comma, period
80
+ dividers = /(,|\. )/
81
+ elsif ['Bge', 'Bck'].include? @code
82
+ # split by semicolon
83
+ dividers = ';'
84
+ end
85
+
86
+ if ['McP', 'Dsn'].include? @code
87
+ # Trim all (parenthetical expressions)
88
+ trim_expressions = /\(.*\)/
89
+ elsif ['Cnt', 'Aca', 'Bse', 'Hmn'].include? @code
90
+ # Trim parenthetical expressions that are <= 4 chars or contain numbers
91
+ trim_expressions = /\((.{0,4}|.*[0-9].*)\)/
92
+ elsif ['Stz', 'Bck'].include? @code
93
+ # Trim parenthetical expressions that contain numbers
94
+ trim_expressions = /\(.*[0-9].*\)/
95
+ elsif ['Rsr'].include? @code
96
+ # Trim all "expressions in quotes"
97
+ trim_expressions = /".*"/
98
+ end
99
+
100
+ if ['Btl', 'Bck'].include? @code
101
+ # Trim everything after a period
102
+ trim_after = '.'
103
+ end
104
+
105
+ {
106
+ :language => language,
107
+ :dividers => dividers,
108
+ :trim_expressions => trim_expressions,
109
+ :trim_after => trim_after
110
+ }
111
+ end
112
+
41
113
  # Returns all Sources in Pollex.
42
114
  # @return [Array<Source>] array of Sources in Pollex
43
115
  def self.all
@@ -49,5 +121,11 @@ module Pollex
49
121
  [:reference, 'td[4]/text()']
50
122
  ])
51
123
  end
124
+
125
+ # Counts the number of Sources within Pollex
126
+ # @return [Integer] number of Sources in Pollex
127
+ def self.count
128
+ self.all.count
129
+ end
52
130
  end
53
131
  end
@@ -0,0 +1,49 @@
1
+ module Pollex
2
+ # Singleton object for translating descriptions into English.
3
+ class Translator
4
+ include Singleton
5
+
6
+ # Instantiates a cache of size 2500 for storing translations.
7
+ def initialize()
8
+ @cache = LRUCache.new(:max_size => 2500, :default => nil)
9
+ end
10
+
11
+ # Translates a phrase into English using the free MyMemory API, and caches
12
+ # the result.
13
+ # @note MyMemory currently has a limit of 100 API requests per IP per day for
14
+ # unregistered users.
15
+ # @param phrase [String] Phrase to be translated
16
+ # @param source_lang_code [String] Two-letter language code for the source language
17
+ # @param context [Array<String>] Adjoining phrases (optional)
18
+ # @return [String] Translated phrase
19
+ def translate(phrase, source_lang_code, context = nil)
20
+ context ||= [phrase]
21
+ if context.all? {|x| CLD.detect_language(x)[:code] == 'en'}
22
+ # we are reasonably sure that this phrase is already in English - no need to translate
23
+ phrase
24
+ else
25
+ # first, check the cache
26
+ key = [phrase, source_lang_code]
27
+ if @cache[key]
28
+ @cache[key]
29
+ else
30
+ # make a request to MyMemory
31
+ puts "Translating '#{phrase}' ..."
32
+ url = "http://mymemory.translated.net/api/get?q=#{URI::encode(phrase)}&langpair=#{source_lang_code}%7Cen"
33
+ results_json = open(url).read
34
+ result = JSON.parse(results_json)['responseData']['translatedText']
35
+
36
+ if result.include? 'MYMEMORY WARNING'
37
+ # translation failed - return original phrase
38
+ puts result
39
+ phrase
40
+ else
41
+ # translation succeeded - store into cache and return translated phrase
42
+ @cache[key] = result
43
+ result
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
@@ -1,3 +1,3 @@
1
1
  module Pollex
2
- VERSION = '0.0.3'
2
+ VERSION = '0.1.0'
3
3
  end
@@ -14,4 +14,6 @@ Gem::Specification.new do |s|
14
14
 
15
15
  s.add_dependency 'nokogiri'
16
16
  s.add_dependency 'lrucache'
17
+ s.add_dependency 'cld'
18
+ s.add_dependency 'json'
17
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pollex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-06 00:00:00.000000000 Z
12
+ date: 2013-03-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -43,6 +43,38 @@ dependencies:
43
43
  - - ! '>='
44
44
  - !ruby/object:Gem::Version
45
45
  version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: cld
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: json
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
46
78
  description: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
47
79
  email: alex.nisnevich@gmail.com
48
80
  executables: []
@@ -60,6 +92,7 @@ files:
60
92
  - lib/pollex/scraper.rb
61
93
  - lib/pollex/semantic_field.rb
62
94
  - lib/pollex/source.rb
95
+ - lib/pollex/translator.rb
63
96
  - lib/pollex/version.rb
64
97
  - pollex.gemspec
65
98
  homepage: http://github.com/AlexNisnevich/pollex
@@ -82,7 +115,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
82
115
  version: '0'
83
116
  requirements: []
84
117
  rubyforge_project:
85
- rubygems_version: 1.8.25
118
+ rubygems_version: 1.8.23
86
119
  signing_key:
87
120
  specification_version: 3
88
121
  summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)