pollex 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
# Top-level namespace shared by every file of the gem.
module Pollex
end

require 'nokogiri'
require 'lrucache'

# Load order matters: 'pollex_class' defines the PollexObject base class and
# the PollexClass mixin that the model files use while their classes are being
# defined, so it has to come first. 'level' is part of the list because
# Reconstruction#level builds Pollex::Level objects.
%w[pollex_class scraper entry language level reconstruction
   semantic_field source].each do |file|
  require File.join(File.dirname(__FILE__), 'pollex', "#{file}.rb")
end
@@ -0,0 +1,47 @@
1
require 'uri'

module Pollex
  # One row scraped from an entry table: a reflex with its description and
  # flag, plus the names/paths needed to lazily build the related Language,
  # Source and Reconstruction objects.
  class Entry < PollexObject
    extend PollexClass

    attr_accessor :reflex, :description, :flag
    # Readers are required for every name passed to attr_inspector, because
    # PollexObject#inspect calls each of them with `send`.
    attr_accessor :reconstruction_name, :language_name, :source_code
    attr_writer :reconstruction_path, :language_path, :source_path
    attr_inspector :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag

    # @return [String, nil] the path of the reconstruction this entry belongs to
    def path
      @reconstruction_path
    end

    # @return [Language] language built (once) from the scraped name and path
    def language
      @language ||= Language.new(:name => @language_name, :path => @language_path)
    end

    # @return [Source, nil] source built (once) from the scraped code and path,
    #   or nil when no source path was set on this entry
    def source
      return nil unless @source_path

      @source ||= Source.new(:code => @source_code, :path => @source_path)
    end

    # @return [Reconstruction, nil] reconstruction built (once) from the scraped
    #   name and path, or nil when no reconstruction path was set
    def reconstruction
      return nil unless @reconstruction_path

      @reconstruction ||= Reconstruction.new(:protoform => @reconstruction_name, :path => @reconstruction_path)
    end

    # Searches entries by name.
    #
    # @param name [#to_s] raw search term; it is form-encoded before being put
    #   into the query string, so spaces and non-ASCII characters are allowed
    # @return [Array<Entry>] the entries found on the first result page
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Entry, "/search/?field=entry&query=#{query}", [
        [:reflex, 'td[3]/text()'],
        [:description, 'td[4]/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:language_name, 'td[1]/a/text()'],
        [:reconstruction_path, 'td[2]/a/@href'],
        # Keep everything after the first dot-separated segment. `drop(1)`
        # yields [] for an empty cell, where `[1..-1]` would yield nil and
        # make `join` raise.
        [:reconstruction_name, 'td[2]/a/text()', lambda { |x| x.split('.').drop(1).join('.') }],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end
  end
end
@@ -0,0 +1,51 @@
1
require 'uri'

module Pollex
  # A language listed on the site, identified by its name and page path.
  # `code` and `count` are filled in by Language.all or computed on demand.
  class Language < PollexObject
    extend PollexClass

    attr_accessor :name, :path
    attr_writer :code, :count
    attr_inspector :name, :code, :count, :path

    # @return [Array<Entry>] entries scraped (once) from this language's page;
    #   each entry is stamped with this language's name and path
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:language_name, nil, lambda { |_| @name }],
        [:language_path, nil, lambda { |_| @path }],
        [:source_code, 'td[4]/a/text()'],
        [:source_path, 'td[4]/a/@href'],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end

    # @return [String, nil] the assigned code, or else the upper-cased third
    #   '/'-separated segment of the path; nil when the path has no such
    #   segment (previously this raised NoMethodError)
    def code
      @code ||= begin
        segment = @path.to_s.split('/')[2]
        segment && segment.upcase
      end
    end

    # @return [String] the assigned count, or else the first whitespace-separated
    #   token of the page's `p.count` paragraph
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # @return [Array<Language>] every row of the /language/ listing (memoized)
    def self.all
      @languages ||= Scraper.instance.get_all(Language, "/language/", [
        [:name, 'td[2]/a/text()'],
        [:path, 'td[1]/a/@href'],
        [:code, 'td[1]/a/text()'],
        [:count, 'td[3]/text()']
      ])
    end

    # @return [Integer] number of languages returned by Language.all
    def self.count
      all.count
    end

    # Searches languages by name.
    #
    # @param name [#to_s] raw search term; form-encoded before being placed in
    #   the query string
    # @return [Array<Language>] matching languages (name and path only)
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Language, "/search/?field=language&query=#{query}", [
        [:name, 'td[1]/a/text()'],
        [:path, 'td[1]/a/@href']
      ])
    end
  end
end
@@ -0,0 +1,42 @@
1
module Pollex
  # A row of the /level/ listing, identified by a token and a page path.
  # `subgroup` and `count` are filled in by Level.all or scraped on demand.
  class Level < PollexObject
    extend PollexClass

    attr_accessor :token, :path
    attr_writer :subgroup, :count
    attr_inspector :token, :subgroup, :count, :path

    # @return [Array<Reconstruction>] reconstructions scraped (once) from this
    #   level's page
    def reconstructions
      @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
        [:path, 'td[1]/a/@href'],
        [:protoform, 'td[1]/a/text()'],
        [:description, 'td[2]/text()']
      ])
    end

    # @return [String, nil] the assigned subgroup, or else the part of the
    #   page's h1 text that follows the first ' - ' separator
    def subgroup
      @subgroup ||= Scraper.instance.get(@path, [
        [:subgroup, 'h1/text()', lambda { |x| x.split(' - ')[1] }]
      ])[:subgroup]
    end

    # @return [String] the assigned count, or else the first whitespace-separated
    #   token of the page's `p.count` paragraph
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # @return [Array<Level>] every row of the /level/ listing (memoized).
    #   The rows are instantiated as Level; the earlier code passed Source,
    #   which has no token/subgroup setters, so those values were dropped.
    def self.all
      @levels ||= Scraper.instance.get_all(Level, "/level/", [
        [:token, 'td[1]/a/text()'],
        [:subgroup, 'td[2]/a/text()'],
        [:path, 'td[2]/a/@href'],
        [:count, 'td[3]/a/text()']
      ])
    end

    # @return [Integer] number of levels returned by Level.all
    def self.count
      all.count
    end
  end
end
@@ -0,0 +1,32 @@
1
module Pollex
  # Base class providing helper instance methods: hash-based construction and
  # a compact `inspect`.
  class PollexObject
    # Assigns each key/value pair through the matching `name=` setter.
    # Pairs without a public setter are ignored.
    # (taken from https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb)
    #
    # @param attrs [Hash{Symbol,String => Object}] attribute values by name
    def initialize(attrs = {})
      super()
      attrs.each do |name, value|
        setter = "#{name}=".to_sym
        send(setter, value) if respond_to?(setter)
      end
    end

    # Shows only the attributes the class registered with `attr_inspector`,
    # each obtained by calling its reader. Falls back to Ruby's default
    # `inspect` when the class registered nothing or does not extend
    # PollexClass at all (which used to raise NoMethodError).
    #
    # @return [String]
    def inspect
      klass = self.class
      fields = klass.respond_to?(:inspectables) ? klass.inspectables : nil
      return super unless fields

      shown = fields.map { |field| "@#{field}=\"#{send(field)}\"" }.join(' ')
      "#<#{klass}:0x#{object_id.to_s(16)} #{shown}>"
    end
  end

  # Helper class methods; `extend` this module in a PollexObject subclass.
  module PollexClass
    # @return [Array<Symbol>, nil] names registered through attr_inspector
    attr_reader :inspectables

    # Registers the attribute names that PollexObject#inspect should display.
    # Every name must have a reader method on the instances.
    def attr_inspector(*attrs)
      @inspectables = attrs
    end
  end
end
@@ -0,0 +1,73 @@
1
require 'uri'

module Pollex
  # A reconstruction identified by its protoform and page path. Description,
  # level, notes and count are scraped from that page on first use.
  class Reconstruction < PollexObject
    extend PollexClass

    attr_accessor :path, :protoform, :semantic_field
    # Only the writer is generated; the reader below scrapes lazily.
    attr_writer :description
    attr_inspector :protoform, :description, :path

    # @return [Array<Entry>] entries scraped (once) from the second table of
    #   this reconstruction's page, each stamped with this protoform and path
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:language_name, 'td[1]/a/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:source_code, 'td[4]/a/text()'],
        [:source_path, 'td[4]/a/@href'],
        [:reconstruction_name, nil, lambda { |_| @protoform }],
        [:reconstruction_path, nil, lambda { |_| @path }],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ], 1)
    end

    # @return [String] the assigned description, or else the text of the first
    #   row of the page's first table
    def description
      @description ||= Scraper.instance.get(@path, [
        [:description, "table[1]/tr[1]/td/text()"]
      ])[:description]
    end

    # @return [Level] level built (once) from the link in the second row of the
    #   page's first table; the token is the link text before the first ':'
    def level
      @level ||= begin
        parts = Scraper.instance.get(@path, [
          [:token, "table[1]/tr[2]/td/a/text()", lambda { |x| x.split(':')[0] }],
          [:path, "table[1]/tr[2]/td/a/@href"]
        ])
        Level.new(:token => parts[:token], :path => parts[:path])
      end
    end

    # @return [String] paragraph text from the third row of the page's first table
    def notes
      @notes ||= Scraper.instance.get(@path, [
        [:notes, "table[1]/tr[3]/td/p/text()"]
      ])[:notes]
    end

    # @return [String] first whitespace-separated token of the page's `p.count`
    #   paragraph
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # @return [Array<Reconstruction>] rows of the /entry/ listing (memoized)
    def self.all
      @reconstructions ||= Scraper.instance.get_all(Reconstruction, "/entry/", [
        [:path, 'td[2]/a/@href'],
        [:protoform, 'td[2]/a/text()'],
        [:description, 'td[3]/text()']
      ])
    end

    # @return [String] first whitespace-separated token of the `p.count`
    #   paragraph on the /entry/ listing (memoized)
    def self.count
      @count ||= Scraper.instance.get("/entry/", [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # Searches reconstructions by protoform.
    #
    # @param name [#to_s] raw search term; form-encoded before being placed in
    #   the query string
    # @return [Array<Reconstruction>] matching reconstructions
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Reconstruction, "/search/?field=protoform&query=#{query}", [
        [:path, 'td[2]/a/@href'],
        [:protoform, 'td[2]/a/text()'],
        [:description, 'td[3]/text()']
      ])
    end
  end
end
@@ -0,0 +1,104 @@
1
+ require 'singleton'
2
+ require 'open-uri'
3
+
4
+ module Pollex
5
# Singleton that downloads pages from pollex.org.nz and extracts attribute
# values from them by XPath. Parsed pages are kept in an LRU cache holding up
# to 100 entries, keyed by path.
class Scraper
  include Singleton

  def initialize
    @cache = LRUCache.new(:max_size => 100, :default => nil)
  end

  # Returns the parsed page for +path+, downloading it only on a cache miss.
  #
  # @param path [String] site-relative path, appended to http://pollex.org.nz
  # @return [Nokogiri::HTML::Document]
  def open_from_cache(path)
    url = "http://pollex.org.nz#{path}"
    cached = @cache[path]
    if cached
      puts "Opening cached contents of #{url} ..."
      return cached
    end

    puts "Connecting to #{url} ..."
    # URI#open (provided by open-uri) is used instead of Kernel#open, which no
    # longer accepts URLs on Ruby 3. The block form closes the stream once the
    # document has been parsed.
    @cache[path] = URI.parse(url).open { |io| Nokogiri::HTML(io) }
  end

  # Gets arbitrary data from the page's #content element by xpath, with
  # optional post-processing.
  #
  # @param path [String] site-relative path of the page
  # @param attr_infos [Array<Array>] triples of [name, xpath, post_processor];
  #   xpath may be nil (value starts as '') and post_processor may be omitted
  # @return [Hash] extracted values keyed by name
  def get(path, attr_infos)
    contents = open_from_cache(path).css('#content')
    extract_attrs(contents, attr_infos)
  end

  # Gets one object per data row of a table, by xpath, with optional
  # post-processing. The first row of the table is skipped. The page goes
  # through the same cache as #get.
  #
  # @param klass [Class, nil] class instantiated with each row's attribute
  #   hash; when nil the raw hashes are returned
  # @param path [String] site-relative path of the page
  # @param attr_infos [Array<Array>] same triples as for #get, applied per row
  # @param table_num [Integer] zero-based index of the table on the page
  # @return [Array, PaginatedArray] a PaginatedArray when the page has a
  #   "Next" pagination link, otherwise a plain Array (empty when the page has
  #   no table at +table_num+)
  def get_all(klass, path, attr_infos, table_num = 0)
    page = open_from_cache(path)

    table = page.css('table')[table_num]
    rows = table ? table.css('tr').to_a : []
    objs = rows.drop(1).map { |row| extract_attrs(row, attr_infos) }

    # check if there is a "next" page
    last_link = page.css('.pagination a').last
    if last_link && last_link.text[0..3] == 'Next'
      results = PaginatedArray.new
      results.query = {:klass => klass, :attr_infos => attr_infos, :table_num => table_num}
      # the link's href replaces whatever query string the current path has
      results.next_page = path.split('?')[0] + last_link['href'].to_s
      results.concat(objs) # merge rather than create new array
    else
      results = objs
    end

    results.map! { |attrs| klass.new(attrs) } if klass
    results
  end

  private

  # Builds a {name => value} hash for one node. Each value is the stripped
  # string form of the node found at the xpath ('' when the xpath is nil or
  # matches nothing), then passed through the post-processor when one is given.
  def extract_attrs(node, attr_infos)
    attr_infos.each_with_object({}) do |(name, xpath, post_processor), attrs|
      value = xpath ? node.at_xpath(xpath).to_s.strip : ''
      value = post_processor.call(value) if post_processor
      attrs[name] = value
    end
  end
end
83
+
84
# Array with a pointer to the next page of results.
#
# +next_page+ is the path of the following result page (nil when there is
# none); +query+ holds the :klass, :attr_infos and :table_num needed to fetch
# that page with the same extraction rules.
class PaginatedArray < Array
  attr_accessor :next_page, :query

  # @return [String] the normal Array listing, followed by a hint about
  #   `more` when another page is available
  def inspect
    # `super` already returns the listing as a String; inspecting it a second
    # time would wrap it in quotes and escape its contents.
    listing = super
    return listing unless @next_page

    "#{listing}\nThere are more items available at #{@next_page}. Use _.more to get them."
  end

  # @return [Array, PaginatedArray, nil] the next page of results, or nil when
  #   there is no next page
  def more
    return nil unless @next_page

    Scraper.instance.get_all(query[:klass], @next_page, query[:attr_infos], query[:table_num])
  end
end
104
+ end
@@ -0,0 +1,30 @@
1
module Pollex
  # A row of the /category/ listing: id, name, count and the path of the page
  # that lists its reconstructions.
  class SemanticField < PollexObject
    extend PollexClass

    attr_accessor :id, :name, :path, :count
    attr_inspector :id, :name, :count, :path

    # @return [Array<Reconstruction>] reconstructions scraped (once) from this
    #   field's page, each carrying a reference back to this object
    def reconstructions
      return @reconstructions if @reconstructions

      owner = self
      rules = [
        [:path, 'td[1]/a/@href'],
        [:protoform, 'td[1]/a/text()'],
        [:description, 'td[2]/text()'],
        [:semantic_field, nil, lambda { |_ignored| owner }]
      ]
      @reconstructions = Scraper.instance.get_all(Reconstruction, @path, rules)
    end

    class << self
      # @return [Array<SemanticField>] rows of the /category/ listing (memoized)
      def all
        return @semantic_fields if @semantic_fields

        rules = [
          [:id, 'td[1]/a/text()'],
          [:path, 'td[1]/a/@href'],
          [:name, 'td[2]/a/text()'],
          [:count, 'td[3]/text()']
        ]
        @semantic_fields = Scraper.instance.get_all(SemanticField, "/category/", rules)
      end

      # @return [Integer] number of semantic fields returned by `all`
      def count
        all.count
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
module Pollex
  # A source identified by its code and page path. Name, reference and count
  # are filled in by Source.all or obtained on demand.
  class Source < PollexObject
    extend PollexClass

    attr_accessor :code, :path
    attr_writer :name, :reference, :count
    attr_inspector :code, :name, :reference, :count, :path

    # @return [Array<Entry>] entries scraped (once) from this source's page
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:language_name, 'td[1]/a/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end

    # @return [String] the assigned name, or else the part of the page's h1
    #   text between 'Entries from ' and ' in Pollex-Online'
    def name
      @name ||= Scraper.instance.get(@path, [
        [:name, 'h1/text()', lambda { |x| x.match('Entries from (.*) in Pollex-Online')[1] }]
      ])[:name]
    end

    # @return [String] the assigned reference, or else the text of the page's
    #   `p.ref` paragraph
    def reference
      @reference ||= Scraper.instance.get(@path, [
        [:reference, "p[@class='ref']/text()"]
      ])[:reference]
    end

    # @return [String, Integer] the assigned count, or else the number of
    #   items returned by #entries. The `entries` method is called rather than
    #   reading @entries, which is nil until `entries` has run.
    def count
      @count ||= entries.count
    end

    # @return [Array<Source>] rows of the /source/ listing (memoized)
    def self.all
      @sources ||= Scraper.instance.get_all(Source, "/source/", [
        [:code, 'td[1]/a/text()'],
        [:path, 'td[1]/a/@href'],
        [:name, 'td[2]/a/text()'],
        [:count, 'td[3]/text()'],
        [:reference, 'td[4]/text()']
      ])
    end
  end
end
@@ -0,0 +1,15 @@
1
# Gem specification for pollex.
Gem::Specification.new do |s|
  s.name        = 'pollex'
  s.version     = '0.0.1'
  s.date        = '2013-03-04'
  s.summary     = "Ruby API for scraping pollex (the Polynesian Lexicon Project)"
  s.description = ""
  s.authors     = ["Alex Nisnevich"]
  s.email       = 'alex.nisnevich@gmail.com'
  s.homepage    = 'http://github.com/AlexNisnevich/pollex'

  # Package every git-tracked file except built .gem archives; a tracked
  # pollex-x.y.z.gem would otherwise be bundled inside the next build.
  s.files       = `git ls-files`.split("\n").reject { |file| File.extname(file) == '.gem' }

  s.add_dependency 'nokogiri'
  s.add_dependency 'lrucache'
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pollex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alex Nisnevich
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: lrucache
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: ''
47
+ email: alex.nisnevich@gmail.com
48
+ executables: []
49
+ extensions: []
50
+ extra_rdoc_files: []
51
+ files:
52
+ - lib/pollex.rb
53
+ - lib/pollex/entry.rb
54
+ - lib/pollex/language.rb
55
+ - lib/pollex/level.rb
56
+ - lib/pollex/pollex_class.rb
57
+ - lib/pollex/reconstruction.rb
58
+ - lib/pollex/scraper.rb
59
+ - lib/pollex/semantic_field.rb
60
+ - lib/pollex/source.rb
61
+ - pollex-0.0.1.gem
62
+ - pollex.gemspec
63
+ homepage: http://github.com/AlexNisnevich/pollex
64
+ licenses: []
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.23
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Ruby API for scraping pollex (the Polynesian Lexicon Project)
87
+ test_files: []