pollex 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,10 @@
1
# Top-level namespace for the pollex scraping library. It is declared empty
# here so the files required below can reopen it.
module Pollex
end

require 'nokogiri'
require 'lrucache'

# Load order matters: 'pollex_class' defines PollexObject and PollexClass,
# which every model class inherits from / extends at definition time.
# 'level' must be loaded as well because Reconstruction#level instantiates
# Pollex::Level.
['pollex_class', 'scraper', 'entry', 'language', 'level', 'reconstruction',
 'semantic_field', 'source'].each do |file|
  require File.dirname(__FILE__) + "/pollex/#{file}.rb"
end
@@ -0,0 +1,47 @@
1
require 'uri' # URI.encode_www_form_component, used by Entry.find

module Pollex
  # One row of an entry table scraped from pollex.org.nz. Instances are built
  # by Scraper#get_all from a hash of attributes; related Language, Source and
  # Reconstruction objects are created lazily from the stored names and paths.
  class Entry < PollexObject
    extend PollexClass

    attr_accessor :reflex, :description, :flag
    # The name/code attributes need readers as well as writers: they are
    # listed in attr_inspector, and PollexObject#inspect calls each one.
    attr_accessor :reconstruction_name, :language_name, :source_code
    attr_writer :reconstruction_path, :language_path, :source_path
    attr_inspector :reflex, :description, :language_name, :source_code, :reconstruction_name, :flag

    # Returns the path of the reconstruction this entry belongs to (may be nil).
    def path
      @reconstruction_path
    end

    # Returns the Language of this entry, built once from the stored name/path.
    def language
      @language ||= Language.new(:name => @language_name, :path => @language_path)
    end

    # Returns the Source of this entry, or nil when no source path was scraped.
    def source
      return nil unless @source_path

      @source ||= Source.new(:code => @source_code, :path => @source_path)
    end

    # Returns the Reconstruction of this entry, or nil when no reconstruction
    # path was scraped.
    def reconstruction
      return nil unless @reconstruction_path

      @reconstruction ||= Reconstruction.new(:protoform => @reconstruction_name, :path => @reconstruction_path)
    end

    # Searches the site for entries matching +name+.
    #
    # name - the search term; it is form-encoded before being placed in the
    #        query string, so spaces, '&' and non-ASCII characters are safe.
    #
    # Returns whatever Scraper#get_all returns for the search page: a
    # collection of Entry objects.
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Entry, "/search/?field=entry&query=#{query}", [
        [:reflex, 'td[3]/text()'],
        [:description, 'td[4]/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:language_name, 'td[1]/a/text()'],
        [:reconstruction_path, 'td[2]/a/@href'],
        # Drop everything up to and including the first '.' of the link text.
        # The `|| []` guards against rows whose link text is empty, where
        # `split('.')[1..-1]` is nil.
        [:reconstruction_name, 'td[2]/a/text()', lambda { |x| (x.split('.')[1..-1] || []).join('.') }],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end
  end
end
@@ -0,0 +1,51 @@
1
require 'uri' # URI.encode_www_form_component, used by Language.find

module Pollex
  # A language listed on pollex.org.nz, identified by its site-relative path.
  # Attributes not supplied at construction time (code, count) are derived or
  # scraped on first access and then memoized.
  class Language < PollexObject
    extend PollexClass

    attr_accessor :name, :path
    attr_writer :code, :count
    attr_inspector :name, :code, :count, :path

    # Returns the entries scraped from this language's page. Every Entry is
    # given this language's own name and path rather than values from the row.
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:language_name, nil, lambda { |_x| @name }],
        [:language_path, nil, lambda { |_x| @path }],
        [:source_code, 'td[4]/a/text()'],
        [:source_path, 'td[4]/a/@href'],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end

    # Returns the language code. When not set explicitly it is the upcased
    # third '/'-separated segment of the path.
    # NOTE(review): presumably paths look like "/language/<code>/" - confirm.
    def code
      @code ||= @path.split('/')[2].upcase
    end

    # Returns the entry count: the first whitespace-separated token of the
    # page's <p class="count"> text, unless a count was already assigned.
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # Returns all languages from the "/language/" listing (memoized per class).
    def self.all
      @languages ||= Scraper.instance.get_all(Language, "/language/", [
        [:name, 'td[2]/a/text()'],
        [:path, 'td[1]/a/@href'],
        [:code, 'td[1]/a/text()'],
        [:count, 'td[3]/text()']
      ])
    end

    # Returns the number of languages in the listing.
    def self.count
      self.all.count
    end

    # Searches the site for languages matching +name+.
    #
    # name - the search term; it is form-encoded before being placed in the
    #        query string, so spaces, '&' and non-ASCII characters are safe.
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Language, "/search/?field=language&query=#{query}", [
        [:name, 'td[1]/a/text()'],
        [:path, 'td[1]/a/@href']
      ])
    end
  end
end
@@ -0,0 +1,42 @@
1
module Pollex
  # A level listed under "/level/" on pollex.org.nz, identified by a token
  # and a site-relative path. Subgroup and count are scraped on first access
  # when they were not supplied at construction time.
  class Level < PollexObject
    extend PollexClass

    attr_accessor :token, :path
    attr_writer :subgroup, :count
    attr_inspector :token, :subgroup, :count, :path

    # Returns the reconstructions listed on this level's page (memoized).
    def reconstructions
      @reconstructions ||= Scraper.instance.get_all(Reconstruction, @path, [
        [:path, 'td[1]/a/@href'],
        [:protoform, 'td[1]/a/text()'],
        [:description, 'td[2]/text()']
      ])
    end

    # Returns the subgroup name: the part of the page's <h1> text after the
    # first ' - ' separator, unless a subgroup was already assigned.
    def subgroup
      @subgroup ||= Scraper.instance.get(@path, [
        [:subgroup, 'h1/text()', lambda { |x| x.split(' - ')[1] }]
      ])[:subgroup]
    end

    # Returns the count: the first whitespace-separated token of the page's
    # <p class="count"> text, unless a count was already assigned.
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # Returns all levels from the "/level/" listing (memoized per class).
    # Rows are instantiated as Level objects; token/subgroup/path/count are
    # Level attributes.
    def self.all
      @levels ||= Scraper.instance.get_all(Level, "/level/", [
        [:token, 'td[1]/a/text()'],
        [:subgroup, 'td[2]/a/text()'],
        [:path, 'td[2]/a/@href'],
        [:count, 'td[3]/a/text()']
      ])
    end

    # Returns the number of levels in the listing.
    def self.count
      self.all.count
    end
  end
end
@@ -0,0 +1,32 @@
1
module Pollex
  # helper instance methods
  class PollexObject
    # taken from https://github.com/neweryankee/nextbus/blob/master/lib/instantiate_with_attrs.rb
    #
    # Assigns each key/value pair of +attrs+ through the matching "name="
    # setter. Keys without a public setter are silently ignored, and a nil
    # +attrs+ is treated as an empty hash.
    def initialize(attrs = {})
      super()
      (attrs || {}).each do |name, value|
        setter = "#{name}=".to_sym
        send(setter, value) if respond_to?(setter)
      end
    end

    # Returns a compact description listing only the attributes the class
    # declared via attr_inspector. Each listed attribute is read by calling
    # its reader method, so it must be readable. Falls back to the default
    # inspect when the class declared no inspectable attributes.
    def inspect
      inspectables = self.class.inspectables
      return super unless inspectables

      fields = inspectables.map { |i| "@#{i}=\"#{send(i)}\"" }.join(' ')
      "#<#{self.class}:0x#{object_id.to_s(16)} #{fields}>"
    end
  end

  # helper class methods
  module PollexClass
    # The attribute names declared with attr_inspector, or nil if none were.
    attr_reader :inspectables

    # Declares which attributes PollexObject#inspect should display.
    def attr_inspector(*attrs)
      @inspectables = attrs
    end
  end
end
@@ -0,0 +1,73 @@
1
require 'uri' # URI.encode_www_form_component, used by Reconstruction.find

module Pollex
  # A reconstruction page on pollex.org.nz, identified by its site-relative
  # path. Details not supplied at construction time are scraped from that page
  # on first access and memoized.
  class Reconstruction < PollexObject
    extend PollexClass

    attr_accessor :path, :protoform, :semantic_field
    # Only a writer here: the reader below scrapes the description on demand.
    attr_writer :description
    attr_inspector :protoform, :description, :path

    # Returns the entries from the second table (index 1) of this
    # reconstruction's page. Every Entry is given this reconstruction's own
    # protoform and path.
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:language_name, 'td[1]/a/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:source_code, 'td[4]/a/text()'],
        [:source_path, 'td[4]/a/@href'],
        [:reconstruction_name, nil, lambda { |_x| @protoform }],
        [:reconstruction_path, nil, lambda { |_x| @path }],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ], 1)
    end

    # Returns the description, scraped from the first row of the page's first
    # table unless one was already assigned.
    def description
      @description ||= Scraper.instance.get(@path, [
        [:description, "table[1]/tr[1]/td/text()"]
      ])[:description]
    end

    # Returns the Level linked from the second row of the page's first table.
    # The token is the link text up to the first ':'.
    def level
      @level ||= begin
        parts = Scraper.instance.get(@path, [
          [:token, "table[1]/tr[2]/td/a/text()", lambda { |x| x.split(':')[0] }],
          [:path, "table[1]/tr[2]/td/a/@href"]
        ])
        Level.new(:token => parts[:token], :path => parts[:path])
      end
    end

    # Returns the notes text from the third row of the page's first table.
    def notes
      @notes ||= Scraper.instance.get(@path, [
        [:notes, "table[1]/tr[3]/td/p/text()"]
      ])[:notes]
    end

    # Returns the count: the first whitespace-separated token of the page's
    # <p class="count"> text.
    def count
      @count ||= Scraper.instance.get(@path, [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # Returns the reconstructions from the "/entry/" listing (memoized per
    # class).
    def self.all
      @reconstructions ||= Scraper.instance.get_all(Reconstruction, "/entry/", [
        [:path, 'td[2]/a/@href'],
        [:protoform, 'td[2]/a/text()'],
        [:description, 'td[3]/text()']
      ])
    end

    # Returns the total count shown on the "/entry/" listing page. It is read
    # from the page's <p class="count"> text rather than by counting rows.
    def self.count
      @count ||= Scraper.instance.get("/entry/", [
        [:count, "p[@class='count']/text()", lambda { |x| x.split(' ').first }]
      ])[:count]
    end

    # Searches the site for reconstructions whose protoform matches +name+.
    #
    # name - the search term; it is form-encoded before being placed in the
    #        query string, so spaces, '&' and non-ASCII characters are safe.
    def self.find(name)
      query = URI.encode_www_form_component(name.to_s)
      Scraper.instance.get_all(Reconstruction, "/search/?field=protoform&query=#{query}", [
        [:path, 'td[2]/a/@href'],
        [:protoform, 'td[2]/a/text()'],
        [:description, 'td[3]/text()']
      ])
    end
  end
end
@@ -0,0 +1,104 @@
1
+ require 'singleton'
2
+ require 'open-uri'
3
+
4
module Pollex
  # Singleton that downloads pages from pollex.org.nz and extracts data from
  # them by XPath. Parsed pages are kept in an LRUCache created with a maximum
  # size of 100.
  class Scraper
    include Singleton

    BASE_URL = 'http://pollex.org.nz'

    def initialize
      @cache = LRUCache.new(:max_size => 100, :default => nil)
    end

    # Returns the parsed page for +path+ (a site-relative path), downloading
    # and caching it when it is not cached yet. Progress is logged to stdout.
    def open_from_cache(path)
      cached = @cache[path]
      if cached
        puts "Opening cached contents of #{BASE_URL}#{path} ..."
        return cached
      end

      puts "Connecting to #{BASE_URL}#{path} ..."
      # OpenURI.open_uri is used instead of Kernel#open, which no longer opens
      # URLs on Ruby 3. The block form closes the IO after parsing.
      @cache[path] = OpenURI.open_uri("#{BASE_URL}#{path}") { |io| Nokogiri::HTML(io) }
    end

    # gets arbitrary data from page by xpath, with optional post-processing
    #
    # path       - site-relative path of the page.
    # attr_infos - array of [name, xpath, post_processor] triples; xpath and
    #              post_processor may each be nil/omitted.
    #
    # Returns a Hash mapping each name to its extracted value, looked up
    # inside the page's '#content' element.
    def get(path, attr_infos)
      contents = open_from_cache(path).css('#content')
      extract_attrs(contents, attr_infos)
    end

    # gets all elements from table by xpath, with optional post-processing
    #
    # klass      - class instantiated with each row's attribute hash, or nil
    #              to return the raw hashes.
    # path       - site-relative path of the page.
    # attr_infos - same triples as for #get, evaluated against each row.
    # table_num  - zero-based index of the table on the page.
    #
    # The first row of the table is skipped (presumably the header row).
    # Returns a PaginatedArray when the page's last pagination link starts
    # with "Next", otherwise a plain Array. A missing table yields an empty
    # result instead of raising.
    def get_all(klass, path, attr_infos, table_num = 0)
      page = open_from_cache(path)

      table = page.css('table')[table_num]
      rows = table ? table.css('tr').to_a : []
      objs = (rows[1..-1] || []).map { |row| extract_attrs(row, attr_infos) }

      # check if there is a "next" page
      last_link = page.css('.pagination a').last
      if last_link && last_link.text[0..3] == 'Next'
        results = PaginatedArray.new
        results.query = {:klass => klass, :attr_infos => attr_infos, :table_num => table_num}
        # the link's href is appended to the current path minus its query string
        results.next_page = path.split('?')[0] + last_link['href'].to_s
        results.concat(objs) # merge rather than create new array
      else
        results = objs
      end

      results.map! { |attrs| klass.new(attrs) } if klass
      results
    end

    private

    # Builds the attribute hash for one node: the stripped string form of the
    # xpath match ('' when there is no xpath), passed through the
    # post-processor when one is given.
    def extract_attrs(node, attr_infos)
      attr_infos.each_with_object({}) do |(name, xpath, post_processor), attrs|
        value = xpath ? node.at_xpath(xpath).to_s.strip : ''
        value = post_processor.call(value) if post_processor
        attrs[name] = value
      end
    end
  end

  # array with a pointer to the next page of results
  class PaginatedArray < Array
    attr_accessor :next_page, :query

    # Returns the normal Array representation, followed by a hint about the
    # next page when there is one.
    def inspect
      str = super
      if @next_page
        str += "\nThere are more items available at #{@next_page}. Use _.more to get them."
      end
      str
    end

    # Fetches the next page of results using the stored query, or returns nil
    # when there is no next page.
    def more
      return nil unless @next_page

      Scraper.instance.get_all(query[:klass], @next_page, query[:attr_infos], query[:table_num])
    end
  end
end
@@ -0,0 +1,30 @@
1
module Pollex
  # A semantic field (category) listed under "/category/" on pollex.org.nz.
  class SemanticField < PollexObject
    extend PollexClass

    attr_accessor :id, :name, :path, :count
    attr_inspector :id, :name, :count, :path

    # Returns the reconstructions listed on this field's page (memoized).
    # Each Reconstruction gets this object as its semantic_field.
    def reconstructions
      return @reconstructions if @reconstructions

      field = self
      columns = [
        [:path, 'td[1]/a/@href'],
        [:protoform, 'td[1]/a/text()'],
        [:description, 'td[2]/text()'],
        [:semantic_field, nil, lambda { |_text| field }]
      ]
      @reconstructions = Scraper.instance.get_all(Reconstruction, @path, columns)
    end

    class << self
      # Returns all semantic fields from the "/category/" listing (memoized
      # per class).
      def all
        @semantic_fields ||= begin
          columns = [
            [:id, 'td[1]/a/text()'],
            [:path, 'td[1]/a/@href'],
            [:name, 'td[2]/a/text()'],
            [:count, 'td[3]/text()']
          ]
          Scraper.instance.get_all(SemanticField, "/category/", columns)
        end
      end

      # Returns the number of semantic fields in the listing.
      def count
        all.count
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
module Pollex
  # A source listed under "/source/" on pollex.org.nz, identified by a code
  # and a site-relative path. Name, reference and count are scraped or
  # computed on first access when they were not supplied at construction time.
  class Source < PollexObject
    extend PollexClass

    attr_accessor :code, :path
    attr_writer :name, :reference, :count
    attr_inspector :code, :name, :reference, :count, :path

    # Returns the entries listed on this source's page (memoized).
    def entries
      @entries ||= Scraper.instance.get_all(Entry, @path, [
        [:language_name, 'td[1]/a/text()'],
        [:language_path, 'td[1]/a/@href'],
        [:reflex, 'td[2]/text()'],
        [:description, 'td[3]/text()'],
        [:flag, "td[3]/span[@class='flag']/text()"]
      ])
    end

    # Returns the source name, extracted from an <h1> of the form
    # "Entries from <name> in Pollex-Online" unless a name was already
    # assigned.
    def name
      @name ||= Scraper.instance.get(@path, [
        [:name, 'h1/text()', lambda { |x| x.match('Entries from (.*) in Pollex-Online')[1] }]
      ])[:name]
    end

    # Returns the reference text from the page's <p class="ref"> unless a
    # reference was already assigned.
    def reference
      @reference ||= Scraper.instance.get(@path, [
        [:name, "p[@class='ref']/text()"]
      ])[:name]
    end

    # Returns the assigned count, or the number of scraped entries. This goes
    # through #entries rather than @entries so it works even when the entries
    # have not been loaded yet.
    def count
      @count ||= entries.count
    end

    # Returns all sources from the "/source/" listing (memoized per class).
    def self.all
      @sources ||= Scraper.instance.get_all(Source, "/source/", [
        [:code, 'td[1]/a/text()'],
        [:path, 'td[1]/a/@href'],
        [:name, 'td[2]/a/text()'],
        [:count, 'td[3]/text()'],
        [:reference, 'td[4]/text()']
      ])
    end
  end
end
@@ -0,0 +1,15 @@
1
# Gem specification for pollex.
Gem::Specification.new do |s|
  s.name        = 'pollex'
  s.version     = '0.0.1'
  s.date        = '2013-03-04'
  s.summary     = "Ruby API for scraping pollex (the Polynesian Lexicon Project)"
  s.description = ""
  s.authors     = ["Alex Nisnevich"]
  s.email       = 'alex.nisnevich@gmail.com'
  s.homepage    = 'http://github.com/AlexNisnevich/pollex'

  # Package every git-tracked file except previously built gem archives,
  # which would otherwise be shipped inside the new gem.
  s.files = `git ls-files`.split("\n").reject { |file| file.end_with?('.gem') }

  s.add_dependency 'nokogiri'
  s.add_dependency 'lrucache'
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pollex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Alex Nisnevich
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: lrucache
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: ''
47
+ email: alex.nisnevich@gmail.com
48
+ executables: []
49
+ extensions: []
50
+ extra_rdoc_files: []
51
+ files:
52
+ - lib/pollex.rb
53
+ - lib/pollex/entry.rb
54
+ - lib/pollex/language.rb
55
+ - lib/pollex/level.rb
56
+ - lib/pollex/pollex_class.rb
57
+ - lib/pollex/reconstruction.rb
58
+ - lib/pollex/scraper.rb
59
+ - lib/pollex/semantic_field.rb
60
+ - lib/pollex/source.rb
61
+ - pollex-0.0.1.gem
62
+ - pollex.gemspec
63
+ homepage: http://github.com/AlexNisnevich/pollex
64
+ licenses: []
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.23
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Ruby API for scraping pollex (the Polynesian Lexicon Project)
87
+ test_files: []