pollex 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A Pollex entry, corresponding to a reflex for a reconstruction, with a language and a source.
2
3
  class Entry < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -8,14 +9,19 @@ module Pollex
8
9
  attr_writer :source_code, :source_path
9
10
  attr_inspector :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
10
11
 
12
+ # @return [(String, nil)] the path to this entry, if given
13
+ # @note In some Pollex listings, entries' paths are not listed.
11
14
  def path
12
15
  @reconstruction_path
13
16
  end
14
17
 
18
+ # @return [Language] the Language corresponding to this entry
15
19
  def language
16
20
  @language ||= Language.new(:name => @language_name, :path => @language_path)
17
21
  end
18
22
 
23
+ # @return [(Source, nil)] the Source corresponding to this entry, if given
24
+ # @note In some Pollex listings, entries' sources are not listed.
19
25
  def source
20
26
  if @source_path
21
27
  @source ||= Source.new(:code => @source_code, :path => @source_path)
@@ -24,6 +30,8 @@ module Pollex
24
30
  end
25
31
  end
26
32
 
33
+ # @return [(Reconstruction, nil)] the Reconstruction corresponding to this entry, if given
34
+ # @note In some Pollex listings, entries' reconstructions are not listed.
27
35
  def reconstruction
28
36
  if @reconstruction_path
29
37
  @reconstruction ||= Reconstruction.new(:protoform => @reconstruction_name, :path => @reconstruction_path)
@@ -32,6 +40,9 @@ module Pollex
32
40
  end
33
41
  end
34
42
 
43
+ # Looks up all Entries matching a given name.
44
+ # @param name [String] term to search for
45
+ # @return [Array<Entry>] array of Entries matching the search term
35
46
  def self.find(name)
36
47
  Scraper.instance.get_all(Entry, "/search/?field=entry&query=#{name}", [
37
48
  [:reflex, 'td[3]/text()'],
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A Polynesian language with entries in Pollex.
2
3
  class Language < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :code, :count
7
8
  attr_inspector :name, :code, :count, :path
8
9
 
10
+ # Returns all Entries belonging to this Language
11
+ # @return [Array<Entry>] array of Entries belonging to this Language
9
12
  def entries
10
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
11
14
  [:reflex, 'td[2]/text()'],
@@ -18,16 +21,20 @@ module Pollex
18
21
  ])
19
22
  end
20
23
 
24
+ # @return [String] the Language's abbreviated code
21
25
  def code
22
26
  @code ||= @path.split('/')[2].upcase
23
27
  end
24
28
 
29
+ # @return [Integer] number of Entries belonging to this Language
25
30
  def count
26
31
  @count ||= Scraper.instance.get(@path, [
27
32
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
28
- ])[:count]
33
+ ])[:count].to_i
29
34
  end
30
35
 
36
+ # Returns all Languages in Pollex.
37
+ # @return [Array<Language>] array of Languages in Pollex
31
38
  def self.all
32
39
  @languages ||= Scraper.instance.get_all(Language, "/language/", [
33
40
  [:name, 'td[2]/a/text()'],
@@ -37,10 +44,15 @@ module Pollex
37
44
  ])
38
45
  end
39
46
 
47
+ # Counts the number of Languages within Pollex
48
+ # @return [Integer] number of Languages in Pollex
40
49
  def self.count
41
50
  self.all.count
42
51
  end
43
52
 
53
+ # Looks up all Languages matching a given name.
54
+ # @param name [String] term to search for
55
+ # @return [Array<Language>] array of Languages matching the search term
44
56
  def self.find(name)
45
57
  Scraper.instance.get_all(Language, "/search/?field=language&query=#{name}", [
46
58
  [:name, 'td[1]/a/text()'],
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A level to which protoforms are reconstructed within Pollex.
2
3
  class Level < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :subgroup, :count
7
8
  attr_inspector :token, :subgroup, :count, :path
8
9
 
10
+ # Returns all Reconstructions at this Level
11
+ # @return [Array<Reconstruction>] array of Reconstructions at this Level
9
12
  def reconstructions
10
13
  @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
11
14
  [:path, 'td[1]/a/@href'],
@@ -14,18 +17,22 @@ module Pollex
14
17
  ])
15
18
  end
16
19
 
20
+ # @return [String] the full name of this Level
17
21
  def subgroup
18
22
  @subgroup ||= Scraper.instance.get(@path, [
19
23
  [:subgroup, 'h1/text()', lambda {|x| x.split(' - ')[1]}]
20
24
  ])[:subgroup]
21
25
  end
22
26
 
27
+ # @return [Integer] number of Reconstructions at this Level
23
28
  def count
24
29
  @count ||= Scraper.instance.get(@path, [
25
30
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
26
31
  ])[:count]
27
32
  end
28
33
 
34
+ # Returns all Levels in Pollex.
35
+ # @return [Array<Level>] array of Levels in Pollex
29
36
  def self.all
30
37
  @levels ||= Scraper.instance.get_all(Source, "/level/", [
31
38
  [:token, 'td[1]/a/text()'],
@@ -35,6 +42,8 @@ module Pollex
35
42
  ])
36
43
  end
37
44
 
45
+ # Counts the number of Levels within Pollex
46
+ # @return [Integer] number of Levels in Pollex
38
47
  def self.count
39
48
  self.all.count
40
49
  end
@@ -1,7 +1,9 @@
1
1
  module Pollex
2
- # helper instance methods
2
+ # Superclass for Pollex objects providing helper instance methods.
3
3
  class PollexObject
4
- # taken from https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb
4
+ # Initializes objects with a hash of attributes.
5
+ # @see https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb
6
+ # @author neweryankee
5
7
  def initialize(attrs={})
6
8
  super()
7
9
  attrs.each do |name, value|
@@ -11,6 +13,9 @@ module Pollex
11
13
  self
12
14
  end
13
15
 
16
+ # Overrides <tt>Object#inspect</tt> to only show the attributes defined
17
+ # by <tt>PollexClass#attr_inspector</tt>.
18
+ # @see PollexClass#attr_inspector
14
19
  def inspect
15
20
  inspectables = self.class.inspectables
16
21
  if inspectables
@@ -21,10 +26,13 @@ module Pollex
21
26
  end
22
27
  end
23
28
 
24
- # helper class methods
29
+ # Provides helper class methods for Pollex classes.
25
30
  module PollexClass
26
31
  attr_reader :inspectables
27
32
 
33
+ # Defines the list of attributes whose values are displayed by <tt>PollexObject#inspect</tt>.
34
+ # @param *attrs [Array<Symbol>] array of attribute labels
35
+ # @see PollexObject#inspect
28
36
  def attr_inspector(*attrs)
29
37
  @inspectables = attrs
30
38
  end
@@ -1,10 +1,13 @@
1
1
  module Pollex
2
+ # A reconstructed protoform in Pollex.
2
3
  class Reconstruction < PollexObject
3
4
  extend PollexClass
4
5
 
5
6
  attr_accessor :path, :protoform, :description, :semantic_field
6
7
  attr_inspector :protoform, :description, :path
7
8
 
9
+ # Returns all Entries belonging to this Reconstruction
10
+ # @return [Array<Entry>] array of Entries belonging to this Reconstruction
8
11
  def entries
9
12
  @entries ||= Scraper.instance.get_all(Entry, @path, [
10
13
  [:reflex, 'td[2]/text()'],
@@ -19,12 +22,14 @@ module Pollex
19
22
  ], 1)
20
23
  end
21
24
 
25
+ # @return [String] the Reconstruction's description
22
26
  def description
23
27
  @description ||= Scraper.instance.get(@path, [
24
28
  [:description, "table[1]/tr[1]/td/text()"]
25
29
  ])[:description]
26
30
  end
27
31
 
32
+ # @return [Level] the Level corresponding to this Reconstruction
28
33
  def level
29
34
  unless @level
30
35
  level_parts = Scraper.instance.get(@path, [
@@ -36,18 +41,22 @@ module Pollex
36
41
  @level
37
42
  end
38
43
 
44
+ # @return [String] the Reconstruction's notes
39
45
  def notes
40
46
  @notes ||= Scraper.instance.get(@path, [
41
47
  [:notes, "table[1]/tr[3]/td/p/text()"]
42
48
  ])[:notes]
43
49
  end
44
50
 
51
+ # @return [Integer] number of Entries belonging to this Reconstruction
45
52
  def count
46
53
  @count ||= Scraper.instance.get(@path, [
47
54
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
48
55
  ])[:count]
49
56
  end
50
57
 
58
+ # Returns all Reconstructions in Pollex.
59
+ # @return [Array<Reconstruction>] array of Reconstructions in Pollex
51
60
  def self.all
52
61
  @sources ||= Scraper.instance.get_all(Reconstruction, "/entry/", [
53
62
  [:path, 'td[2]/a/@href'],
@@ -56,12 +65,17 @@ module Pollex
56
65
  ])
57
66
  end
58
67
 
68
+ # Counts the number of Reconstructions within Pollex
69
+ # @return [Integer] number of Reconstructions in Pollex
59
70
  def self.count
60
71
  @count ||= Scraper.instance.get("/entry/", [
61
72
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
62
73
  ])[:count]
63
74
  end
64
75
 
76
+ # Looks up all Reconstructions matching a given name.
77
+ # @param name [String] term to search for
78
+ # @return [Array<Reconstruction>] array of Reconstructions matching the search term
65
79
  def self.find(name)
66
80
  Scraper.instance.get_all(Reconstruction, "/search/?field=protoform&query=#{name}", [
67
81
  [:path, 'td[2]/a/@href'],
@@ -2,28 +2,53 @@ require 'singleton'
2
2
  require 'open-uri'
3
3
 
4
4
  module Pollex
5
+ # Singleton object for scraping Pollex, caching the results, and extracting data.
5
6
  class Scraper
6
7
  include Singleton
7
8
 
9
+ attr_accessor :verbose
10
+
11
+ # Instantiates a cache of size 100 for storing scraped pages.
8
12
  def initialize()
9
13
  @cache = LRUCache.new(:max_size => 100, :default => nil)
14
+ @verbose = false
10
15
  end
11
16
 
12
- def open_from_cache(path)
17
+ # Opens the given Pollex page, either by retrieving it from the cache
18
+ # or by making a request with Nokogiri and then storing it in the cache.
19
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
20
+ # @return [Nokogiri::HTML::Document] the requested page, parsed with Nokogiri
21
+ def open_with_cache(path)
13
22
  if @cache[path]
14
- puts "Opening cached contents of http://pollex.org.nz#{path} ..."
23
+ if @verbose
24
+ puts "Opening cached contents of http://pollex.org.nz#{path} ..."
25
+ end
15
26
  @cache[path]
16
27
  else
17
- puts "Connecting to http://pollex.org.nz#{path} ..."
28
+ if @verbose
29
+ puts "Connecting to http://pollex.org.nz#{path} ..."
30
+ end
18
31
  page = Nokogiri::HTML(open("http://pollex.org.nz#{path}"))
19
32
  @cache[path] = page
20
33
  page
21
34
  end
22
35
  end
23
36
 
24
- # gets arbitrary data from page by xpath, with optional post-processing
37
+ # Gets arbitrary data from a page, with optional post-processing.
38
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
39
+ # @param attr_infos [Array<Array<Symbol, String, (Proc, nil)>>] an array that,
40
+ # for each element to be scraped, contains an array of:
41
+ # * a key for the element
42
+ # * the XPath to the element, from the <tt>div#content</tt> tag of the page
43
+ # * (optionally) a Proc to be performed on the element's contents
44
+ # @return [Hash{Symbol => String}] hash of key-value pairs
45
+ # @example Return information about the level of a given reconstruction
46
+ # Scraper.instance.get(@reconstruction_path, [
47
+ # [:level_token, "table[1]/tr[2]/td/a/text()", lambda {|x| x.split(':')[0]}],
48
+ # [:level_path, "table[1]/tr[2]/td/a/@href"]
49
+ # ])
25
50
  def get(path, attr_infos)
26
- page = open_from_cache(path)
51
+ page = open_with_cache(path)
27
52
  contents = page.css('#content')
28
53
 
29
54
  attrs = {}
@@ -39,10 +64,32 @@ module Pollex
39
64
  attrs
40
65
  end
41
66
 
42
- # gets all elements from table by xpath, with optional post-processing
67
+ # Gets all elements from a table within a page, with optional post-processing.
68
+ # The results are returned as either an array of key-value pairs or as an
69
+ # array of objects, if a klass is specified. If more than one page of results is
70
+ # found, the first page of results is returned as a PaginatedArray.
71
+ # @param klass [Class] (optional) class of objects to be instantiated
72
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
73
+ # @param attr_infos [Array<Array<Symbol, String, (Proc, nil)>>] an array that,
74
+ # for each element to be scraped, contains an array of:
75
+ # * a key for the element
76
+ # * the XPath to the element, from a given table
77
+ # * (optionally) a Proc to be performed on the element's contents
78
+ # @param table_num [Integer] the number of the table on the page to process
79
+ # (default: 0 - that is, the first table on the page)
80
+ # @return [Array<klass>] if one page of results was found
81
+ # @return [PaginatedArray<klass>] if multiple pages of results were found
82
+ # @return [Array<Array<Symbol, String>>] if no klass is specified
83
+ # @example Return an array of all SemanticFields in Pollex
84
+ # Scraper.instance.get_all(SemanticField, "/category/", [
85
+ # [:id, 'td[1]/a/text()'],
86
+ # [:path, 'td[1]/a/@href'],
87
+ # [:name, 'td[2]/a/text()'],
88
+ # [:count, 'td[3]/text()']
89
+ # ])
43
90
  def get_all(klass, path, attr_infos, table_num = 0)
44
91
  puts "Connecting to http://pollex.org.nz#{path} ..."
45
- page = Nokogiri::HTML(open("http://pollex.org.nz#{path}"))
92
+ page = open_with_cache(path)
46
93
 
47
94
  rows = page.css('table')[table_num].css('tr')
48
95
  objs = rows[1..-1].map do |row|
@@ -81,7 +128,7 @@ module Pollex
81
128
  end
82
129
  end
83
130
 
84
- # array with a pointer to the next page of results
131
+ # Array with an optional pointer to the next page of results
85
132
  class PaginatedArray < Array
86
133
  attr_accessor :next_page, :query
87
134
 
@@ -93,6 +140,9 @@ module Pollex
93
140
  str
94
141
  end
95
142
 
143
+ # Returns the next page of results, if one exists
144
+ # @return PaginatedArray<@query[:klass]>
145
+ # @see Scraper#get_all
96
146
  def more
97
147
  if @next_page
98
148
  Scraper.instance.get_all(query[:klass], @next_page, query[:attr_infos], query[:table_num])
@@ -1,10 +1,13 @@
1
1
  module Pollex
2
+ # A semantic class containing a list of Pollex reconstructed protoforms.
2
3
  class SemanticField < PollexObject
3
4
  extend PollexClass
4
5
 
5
6
  attr_accessor :id, :name, :path, :count
6
7
  attr_inspector :id, :name, :count, :path
7
8
 
9
+ # Returns all Reconstructions corresponding to this SemanticField
10
+ # @return [Array<Reconstruction>] array of Reconstructions corresponding to this SemanticField
8
11
  def reconstructions
9
12
  @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
10
13
  [:path, 'td[1]/a/@href'],
@@ -14,6 +17,8 @@ module Pollex
14
17
  ])
15
18
  end
16
19
 
20
+ # Returns all SemanticFields in Pollex.
21
+ # @return [Array<SemanticField>] array of SemanticFields in Pollex
17
22
  def self.all
18
23
  @semantic_fields ||= Scraper.instance.get_all(SemanticField, "/category/", [
19
24
  [:id, 'td[1]/a/text()'],
@@ -23,6 +28,8 @@ module Pollex
23
28
  ])
24
29
  end
25
30
 
31
+ # Counts the number of SemanticFields within Pollex
32
+ # @return [Integer] number of SemanticFields in Pollex
26
33
  def self.count
27
34
  self.all.count
28
35
  end
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A source of entries in Pollex.
2
3
  class Source < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :name, :reference, :count
7
8
  attr_inspector :code, :name, :reference, :count, :path
8
9
 
10
+ # Returns all Entries belonging to this Source
11
+ # @return [Array<Entry>] array of Entries belonging to this Source
9
12
  def entries
10
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
11
14
  [:language_name, 'td[1]/a/text()'],
@@ -16,22 +19,27 @@ module Pollex
16
19
  ])
17
20
  end
18
21
 
22
+ # @return [String] full name of this Source
19
23
  def name
20
24
  @name ||= Scraper.instance.get(@path, [
21
25
  [:name, 'h1/text()', lambda {|x| x.match('Entries from (.*) in Pollex-Online')[1]}]
22
26
  ])[:name]
23
27
  end
24
28
 
29
+ # @return [String] reference information for this Source
25
30
  def reference
26
31
  @reference ||= Scraper.instance.get(@path, [
27
32
  [:name, "p[@class='ref']/text()"]
28
33
  ])[:name]
29
34
  end
30
35
 
36
+ # @return [Integer] number of Entries belonging to this Source
31
37
  def count
32
38
  @count ||= @entries.count
33
39
  end
34
40
 
41
+ # Returns all Sources in Pollex.
42
+ # @return [Array<Source>] array of Sources in Pollex
35
43
  def self.all
36
44
  @sources ||= Scraper.instance.get_all(Source, "/source/", [
37
45
  [:code, 'td[1]/a/text()'],
@@ -1,8 +1,8 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'pollex'
3
- s.version = '0.0.1'
3
+ s.version = '0.0.2'
4
4
  s.date = '2013-03-04'
5
- s.summary = "Ruby API for scraping pollex (the Polynesian Lexicon Project)"
5
+ s.summary = "Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)"
6
6
  s.description = ""
7
7
  s.authors = ["Alex Nisnevich"]
8
8
  s.email = 'alex.nisnevich@gmail.com'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pollex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -58,7 +58,6 @@ files:
58
58
  - lib/pollex/scraper.rb
59
59
  - lib/pollex/semantic_field.rb
60
60
  - lib/pollex/source.rb
61
- - pollex-0.0.1.gem
62
61
  - pollex.gemspec
63
62
  homepage: http://github.com/AlexNisnevich/pollex
64
63
  licenses: []
@@ -80,8 +79,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
79
  version: '0'
81
80
  requirements: []
82
81
  rubyforge_project:
83
- rubygems_version: 1.8.23
82
+ rubygems_version: 1.8.25
84
83
  signing_key:
85
84
  specification_version: 3
86
- summary: Ruby API for scraping pollex (the Polynesian Lexicon Project)
85
+ summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
87
86
  test_files: []
87
+ has_rdoc: