pollex 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A Pollex entry, corresponding to a reflex for a reconstruction, with a language and a source.
2
3
  class Entry < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -8,14 +9,19 @@ module Pollex
8
9
  attr_writer :source_code, :source_path
9
10
  attr_inspector :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag
10
11
 
12
+ # @return [(String, nil)] the path to this entry, if given
13
+ # @note In some Pollex listings, entries' paths are not listed.
11
14
  def path
12
15
  @reconstruction_path
13
16
  end
14
17
 
18
+ # @return [Language] the Language corresponding to this entry
15
19
  def language
16
20
  @language ||= Language.new(:name => @language_name, :path => @language_path)
17
21
  end
18
22
 
23
+ # @return [(Source, nil)] the Source corresponding to this entry, if given
24
+ # @note In some Pollex listings, entries' sources are not listed.
19
25
  def source
20
26
  if @source_path
21
27
  @source ||= Source.new(:code => @source_code, :path => @source_path)
@@ -24,6 +30,8 @@ module Pollex
24
30
  end
25
31
  end
26
32
 
33
+ # @return [(Reconstruction, nil)] the Reconstruction corresponding to this entry, if given
34
+ # @note In some Pollex listings, entries' reconstructions are not listed.
27
35
  def reconstruction
28
36
  if @reconstruction_path
29
37
  @reconstruction ||= Reconstruction.new(:protoform => @reconstruction_name, :path => @reconstruction_path)
@@ -32,6 +40,9 @@ module Pollex
32
40
  end
33
41
  end
34
42
 
43
+ # Looks up all Entries matching a given name.
44
+ # @param name [String] term to search for
45
+ # @return [Array<Entry>] array of Entries matching the search term
35
46
  def self.find(name)
36
47
  Scraper.instance.get_all(Entry, "/search/?field=entry&query=#{name}", [
37
48
  [:reflex, 'td[3]/text()'],
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A Polynesian language with entries in Pollex.
2
3
  class Language < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :code, :count
7
8
  attr_inspector :name, :code, :count, :path
8
9
 
10
+ # Returns all Entries belonging to this Language
11
+ # @return [Array<Entry>] array of Entries belonging to this Language
9
12
  def entries
10
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
11
14
  [:reflex, 'td[2]/text()'],
@@ -18,16 +21,20 @@ module Pollex
18
21
  ])
19
22
  end
20
23
 
24
+ # @return [String] the Language's abbreviated code
21
25
  def code
22
26
  @code ||= @path.split('/')[2].upcase
23
27
  end
24
28
 
29
+ # @return [Integer] number of Entries belonging to this Language
25
30
  def count
26
31
  @count ||= Scraper.instance.get(@path, [
27
32
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
28
- ])[:count]
33
+ ])[:count].to_i
29
34
  end
30
35
 
36
+ # Returns all Languages in Pollex.
37
+ # @return [Array<Language>] array of Languages in Pollex
31
38
  def self.all
32
39
  @languages ||= Scraper.instance.get_all(Language, "/language/", [
33
40
  [:name, 'td[2]/a/text()'],
@@ -37,10 +44,15 @@ module Pollex
37
44
  ])
38
45
  end
39
46
 
47
+ # Counts the number of Languages within Pollex
48
+ # @return [Integer] number of Languages in Pollex
40
49
  def self.count
41
50
  self.all.count
42
51
  end
43
52
 
53
+ # Looks up all Languages matching a given name.
54
+ # @param name [String] term to search for
55
+ # @return [Array<Language>] array of Languages matching the search term
44
56
  def self.find(name)
45
57
  Scraper.instance.get_all(Language, "/search/?field=language&query=#{name}", [
46
58
  [:name, 'td[1]/a/text()'],
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A level to which protoforms are reconstructed within Pollex.
2
3
  class Level < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :subgroup, :count
7
8
  attr_inspector :token, :subgroup, :count, :path
8
9
 
10
+ # Returns all Reconstructions at this Level
11
+ # @return [Array<Reconstruction>] array of Reconstructions at this Level
9
12
  def reconstructions
10
13
  @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
11
14
  [:path, 'td[1]/a/@href'],
@@ -14,18 +17,22 @@ module Pollex
14
17
  ])
15
18
  end
16
19
 
20
+ # @return [String] the full name of this Level
17
21
  def subgroup
18
22
  @subgroup ||= Scraper.instance.get(@path, [
19
23
  [:subgroup, 'h1/text()', lambda {|x| x.split(' - ')[1]}]
20
24
  ])[:subgroup]
21
25
  end
22
26
 
27
+ # @return [Integer] number of Reconstructions at this Level
23
28
  def count
24
29
  @count ||= Scraper.instance.get(@path, [
25
30
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
26
31
  ])[:count]
27
32
  end
28
33
 
34
+ # Returns all Levels in Pollex.
35
+ # @return [Array<Level>] array of Levels in Pollex
29
36
  def self.all
30
37
  @levels ||= Scraper.instance.get_all(Source, "/level/", [
31
38
  [:token, 'td[1]/a/text()'],
@@ -35,6 +42,8 @@ module Pollex
35
42
  ])
36
43
  end
37
44
 
45
+ # Counts the number of Levels within Pollex
46
+ # @return [Integer] number of Levels in Pollex
38
47
  def self.count
39
48
  self.all.count
40
49
  end
@@ -1,7 +1,9 @@
1
1
  module Pollex
2
- # helper instance methods
2
+ # Superclass for Pollex objects providing helper instance methods.
3
3
  class PollexObject
4
- # taken from https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb
4
+ # Initializes objects with a hash of attributes.
5
+ # @see https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb
6
+ # @author neweryankee
5
7
  def initialize(attrs={})
6
8
  super()
7
9
  attrs.each do |name, value|
@@ -11,6 +13,9 @@ module Pollex
11
13
  self
12
14
  end
13
15
 
16
+ # Overrides <tt>Object#inspect</tt> to only show the attributes defined
17
+ # by <tt>PollexClass#attr_inspector</tt>.
18
+ # @see PollexClass#attr_inspector
14
19
  def inspect
15
20
  inspectables = self.class.inspectables
16
21
  if inspectables
@@ -21,10 +26,13 @@ module Pollex
21
26
  end
22
27
  end
23
28
 
24
- # helper class methods
29
+ # Provides helper class methods for Pollex classes.
25
30
  module PollexClass
26
31
  attr_reader :inspectables
27
32
 
33
+ # Defines the list of attributes whose values are displayed by <tt>PollexObject#inspect</tt>.
34
+ # @param *attrs [Array<Symbol>] array of attribute labels
35
+ # @see PollexObject#inspect
28
36
  def attr_inspector(*attrs)
29
37
  @inspectables = attrs
30
38
  end
@@ -1,10 +1,13 @@
1
1
  module Pollex
2
+ # A reconstructed protoform in Pollex.
2
3
  class Reconstruction < PollexObject
3
4
  extend PollexClass
4
5
 
5
6
  attr_accessor :path, :protoform, :description, :semantic_field
6
7
  attr_inspector :protoform, :description, :path
7
8
 
9
+ # Returns all Entries belonging to this Reconstruction
10
+ # @return [Array<Entry>] array of Entries belonging to this Reconstruction
8
11
  def entries
9
12
  @entries ||= Scraper.instance.get_all(Entry, @path, [
10
13
  [:reflex, 'td[2]/text()'],
@@ -19,12 +22,14 @@ module Pollex
19
22
  ], 1)
20
23
  end
21
24
 
25
+ # @return [String] the Reconstruction's description
22
26
  def description
23
27
  @description ||= Scraper.instance.get(@path, [
24
28
  [:description, "table[1]/tr[1]/td/text()"]
25
29
  ])[:description]
26
30
  end
27
31
 
32
+ # @return [Level] the Level corresponding to this Reconstruction
28
33
  def level
29
34
  unless @level
30
35
  level_parts = Scraper.instance.get(@path, [
@@ -36,18 +41,22 @@ module Pollex
36
41
  @level
37
42
  end
38
43
 
44
+ # @return [String] the Reconstruction's notes
39
45
  def notes
40
46
  @notes ||= Scraper.instance.get(@path, [
41
47
  [:notes, "table[1]/tr[3]/td/p/text()"]
42
48
  ])[:notes]
43
49
  end
44
50
 
51
+ # @return [Integer] number of Entries belonging to this Reconstruction
45
52
  def count
46
53
  @count ||= Scraper.instance.get(@path, [
47
54
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
48
55
  ])[:count]
49
56
  end
50
57
 
58
+ # Returns all Reconstructions in Pollex.
59
+ # @return [Array<Reconstruction>] array of Reconstructions in Pollex
51
60
  def self.all
52
61
  @sources ||= Scraper.instance.get_all(Reconstruction, "/entry/", [
53
62
  [:path, 'td[2]/a/@href'],
@@ -56,12 +65,17 @@ module Pollex
56
65
  ])
57
66
  end
58
67
 
68
+ # Counts the number of Reconstructions within Pollex
69
+ # @return [Integer] number of Reconstructions in Pollex
59
70
  def self.count
60
71
  @count ||= Scraper.instance.get("/entry/", [
61
72
  [:count, "p[@class='count']/text()", lambda {|x| x.split(' ').first}]
62
73
  ])[:count]
63
74
  end
64
75
 
76
+ # Looks up all Reconstructions matching a given name.
77
+ # @param name [String] term to search for
78
+ # @return [Array<Reconstruction>] array of Reconstructions matching the search term
65
79
  def self.find(name)
66
80
  Scraper.instance.get_all(Reconstruction, "/search/?field=protoform&query=#{name}", [
67
81
  [:path, 'td[2]/a/@href'],
@@ -2,28 +2,53 @@ require 'singleton'
2
2
  require 'open-uri'
3
3
 
4
4
  module Pollex
5
+ # Singleton object for scraping Pollex, caching the results, and extracting data.
5
6
  class Scraper
6
7
  include Singleton
7
8
 
9
+ attr_accessor :verbose
10
+
11
+ # Instantiates a cache of size 100 for storing scraped pages.
8
12
  def initialize()
9
13
  @cache = LRUCache.new(:max_size => 100, :default => nil)
14
+ @verbose = false
10
15
  end
11
16
 
12
- def open_from_cache(path)
17
+ # Opens the given Pollex page, either by retrieving it from the cache
18
+ # or by making a request with Nokogiri and then storing it in the cache.
19
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
20
+ # @return [Nokogiri::HTML::Document] the requested page, parsed with Nokogiri
21
+ def open_with_cache(path)
13
22
  if @cache[path]
14
- puts "Opening cached contents of http://pollex.org.nz#{path} ..."
23
+ if @verbose
24
+ puts "Opening cached contents of http://pollex.org.nz#{path} ..."
25
+ end
15
26
  @cache[path]
16
27
  else
17
- puts "Connecting to http://pollex.org.nz#{path} ..."
28
+ if @verbose
29
+ puts "Connecting to http://pollex.org.nz#{path} ..."
30
+ end
18
31
  page = Nokogiri::HTML(open("http://pollex.org.nz#{path}"))
19
32
  @cache[path] = page
20
33
  page
21
34
  end
22
35
  end
23
36
 
24
- # gets arbitrary data from page by xpath, with optional post-processing
37
+ # Gets arbitrary data from a page, with optional post-processing.
38
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
39
+ # @param attr_infos [Array<Array<Symbol, String, (Proc, nil)>>] an array that,
40
+ # for each element to be scraped, contains an array of:
41
+ # * a key for the element
42
+ # * the XPath to the element, from the <tt>div#content</tt> tag of the page
43
+ # * (optionally) a Proc to be performed on the element's contents
44
+ # @return [Hash{Symbol => String}] hash mapping each key to its scraped value
45
+ # @example Return information about the level of a given reconstruction
46
+ # Scraper.instance.get(@reconstruction_path, [
47
+ # [:level_token, "table[1]/tr[2]/td/a/text()", lambda {|x| x.split(':')[0]}],
48
+ # [:level_path, "table[1]/tr[2]/td/a/@href"]
49
+ # ])
25
50
  def get(path, attr_infos)
26
- page = open_from_cache(path)
51
+ page = open_with_cache(path)
27
52
  contents = page.css('#content')
28
53
 
29
54
  attrs = {}
@@ -39,10 +64,32 @@ module Pollex
39
64
  attrs
40
65
  end
41
66
 
42
- # gets all elements from table by xpath, with optional post-processing
67
+ # Gets all elements from a table within a page, with optional post-processing.
68
+ # The results are returned as either an array of key-value pairs or as an
69
+ # array of objects, if a klass is specified. If more than one page of results is
70
+ # found, the first page of results is returned as a PaginatedArray.
71
+ # @param klass [Class] (optional) class of objects to be instantiated
72
+ # @param path [String] relative path from <tt>http://pollex.org.nz</tt>
73
+ # @param attr_infos [Array<Array<Symbol, String, (Proc, nil)>>] an array that,
74
+ # for each element to be scraped, contains an array of:
75
+ # * a key for the element
76
+ # * the XPath to the element, from a given table
77
+ # * (optionally) a Proc to be performed on the element's contents
78
+ # @param table_num [Integer] the number of the table on the page to process
79
+ # (default: 0 - that is, the first table on the page)
80
+ # @return [Array<klass>] if one page of results was found
81
+ # @return [PaginatedArray<klass>] if multiple pages of results were found
82
+ # @return [Array<Array<Symbol, String>>] if no klass is specified
83
+ # @example Return an array of all SemanticFields in Pollex
84
+ # Scraper.instance.get_all(SemanticField, "/category/", [
85
+ # [:id, 'td[1]/a/text()'],
86
+ # [:path, 'td[1]/a/@href'],
87
+ # [:name, 'td[2]/a/text()'],
88
+ # [:count, 'td[3]/text()']
89
+ # ])
43
90
  def get_all(klass, path, attr_infos, table_num = 0)
44
91
  puts "Connecting to http://pollex.org.nz#{path} ..."
45
- page = Nokogiri::HTML(open("http://pollex.org.nz#{path}"))
92
+ page = open_with_cache(path)
46
93
 
47
94
  rows = page.css('table')[table_num].css('tr')
48
95
  objs = rows[1..-1].map do |row|
@@ -81,7 +128,7 @@ module Pollex
81
128
  end
82
129
  end
83
130
 
84
- # array with a pointer to the next page of results
131
+ # Array with an optional pointer to the next page of results
85
132
  class PaginatedArray < Array
86
133
  attr_accessor :next_page, :query
87
134
 
@@ -93,6 +140,9 @@ module Pollex
93
140
  str
94
141
  end
95
142
 
143
+ # Returns the next page of results, if one exists
144
+ # @return PaginatedArray<@query[:klass]>
145
+ # @see Scraper#get_all
96
146
  def more
97
147
  if @next_page
98
148
  Scraper.instance.get_all(query[:klass], @next_page, query[:attr_infos], query[:table_num])
@@ -1,10 +1,13 @@
1
1
  module Pollex
2
+ # A semantic class containing a list of Pollex reconstructed protoforms.
2
3
  class SemanticField < PollexObject
3
4
  extend PollexClass
4
5
 
5
6
  attr_accessor :id, :name, :path, :count
6
7
  attr_inspector :id, :name, :count, :path
7
8
 
9
+ # Returns all Reconstructions corresponding to this SemanticField
10
+ # @return [Array<Reconstruction>] array of Reconstructions corresponding to this SemanticField
8
11
  def reconstructions
9
12
  @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
10
13
  [:path, 'td[1]/a/@href'],
@@ -14,6 +17,8 @@ module Pollex
14
17
  ])
15
18
  end
16
19
 
20
+ # Returns all SemanticFields in Pollex.
21
+ # @return [Array<SemanticField>] array of SemanticFields in Pollex
17
22
  def self.all
18
23
  @semantic_fields ||= Scraper.instance.get_all(SemanticField, "/category/", [
19
24
  [:id, 'td[1]/a/text()'],
@@ -23,6 +28,8 @@ module Pollex
23
28
  ])
24
29
  end
25
30
 
31
+ # Counts the number of SemanticFields within Pollex
32
+ # @return [Integer] number of SemanticFields in Pollex
26
33
  def self.count
27
34
  self.all.count
28
35
  end
@@ -1,4 +1,5 @@
1
1
  module Pollex
2
+ # A source of entries in Pollex.
2
3
  class Source < PollexObject
3
4
  extend PollexClass
4
5
 
@@ -6,6 +7,8 @@ module Pollex
6
7
  attr_writer :name, :reference, :count
7
8
  attr_inspector :code, :name, :reference, :count, :path
8
9
 
10
+ # Returns all Entries belonging to this Source
11
+ # @return [Array<Entry>] array of Entries belonging to this Source
9
12
  def entries
10
13
  @entries ||= Scraper.instance.get_all(Entry, @path, [
11
14
  [:language_name, 'td[1]/a/text()'],
@@ -16,22 +19,27 @@ module Pollex
16
19
  ])
17
20
  end
18
21
 
22
+ # @return [String] full name of this Source
19
23
  def name
20
24
  @name ||= Scraper.instance.get(@path, [
21
25
  [:name, 'h1/text()', lambda {|x| x.match('Entries from (.*) in Pollex-Online')[1]}]
22
26
  ])[:name]
23
27
  end
24
28
 
29
+ # @return [String] reference information for this Source
25
30
  def reference
26
31
  @reference ||= Scraper.instance.get(@path, [
27
32
  [:name, "p[@class='ref']/text()"]
28
33
  ])[:name]
29
34
  end
30
35
 
36
+ # @return [Integer] number of Entries belonging to this Source
31
37
  def count
32
38
  @count ||= @entries.count
33
39
  end
34
40
 
41
+ # Returns all Sources in Pollex.
42
+ # @return [Array<Source>] array of Sources in Pollex
35
43
  def self.all
36
44
  @sources ||= Scraper.instance.get_all(Source, "/source/", [
37
45
  [:code, 'td[1]/a/text()'],
@@ -1,8 +1,8 @@
1
1
  Gem::Specification.new do |s|
2
2
  s.name = 'pollex'
3
- s.version = '0.0.1'
3
+ s.version = '0.0.2'
4
4
  s.date = '2013-03-04'
5
- s.summary = "Ruby API for scraping pollex (the Polynesian Lexicon Project)"
5
+ s.summary = "Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)"
6
6
  s.description = ""
7
7
  s.authors = ["Alex Nisnevich"]
8
8
  s.email = 'alex.nisnevich@gmail.com'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pollex
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -58,7 +58,6 @@ files:
58
58
  - lib/pollex/scraper.rb
59
59
  - lib/pollex/semantic_field.rb
60
60
  - lib/pollex/source.rb
61
- - pollex-0.0.1.gem
62
61
  - pollex.gemspec
63
62
  homepage: http://github.com/AlexNisnevich/pollex
64
63
  licenses: []
@@ -80,8 +79,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
79
  version: '0'
81
80
  requirements: []
82
81
  rubyforge_project:
83
- rubygems_version: 1.8.23
82
+ rubygems_version: 1.8.25
84
83
  signing_key:
85
84
  specification_version: 3
86
- summary: Ruby API for scraping pollex (the Polynesian Lexicon Project)
85
+ summary: Ruby wrapper for scraping pollex (the Polynesian Lexicon Project)
87
86
  test_files: []
87
+ has_rdoc: