bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'fileutils'

require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))

# Storage folder for downloaded pages, the biblionet book-page URL prefix,
# and the extension used for saved files.
FOLDER    = 'html_book_pages'
BASE_URL  = 'http://www.biblionet.gr/book/'
EXTENSION = '.html'

# Crawl biblionet book pages with ids 300001..400000, in batches of 1000.
# Each 1000-page batch is saved under a numbered subfolder (300, 301, ...).
301000.step(400000, 1000) do |last|
  first = last - 1000 + 1
  subfolder = (last / 1000 - 1).to_s
  path = "#{FOLDER}/#{subfolder}/"

  # Create a new directory (does nothing if directory exists)
  FileUtils.mkdir_p path

  first.upto(last) do |id|
    file_to_save = "#{path}book_#{id}#{EXTENSION}"
    url_to_download = "#{BASE_URL}#{id}/"

    # FIX: the required '../extractors/base' file defines
    # Biblionet::Extractors::Base — the original referenced
    # Biblionet::Core::Base, which would raise NameError.
    downloader = Biblionet::Extractors::Base.new(url_to_download)
    downloader.save_page(file_to_save) unless downloader.page.nil?
  end
end
@@ -0,0 +1,55 @@
1
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'fileutils'

require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))

# Storage folder for downloaded DCC (Dewey classification) pages, the
# biblionet index URL prefix, and the extension used for saved files.
FOLDER    = 'html_dcc_pages'
BASE_URL  = 'http://www.biblionet.gr/index/'
EXTENSION = '.html'

# Crawl biblionet index pages with ids 1..6000, in batches of 1000.
# Each 1000-page batch is saved under a numbered subfolder (0, 1, ...).
1000.step(6000, 1000) do |last|
  first = last - 1000 + 1
  subfolder = (last / 1000 - 1).to_s
  path = "#{FOLDER}/#{subfolder}/"

  # Create a new directory (does nothing if directory exists)
  FileUtils.mkdir_p path

  first.upto(last) do |id|
    file_to_save = "#{path}dcc_#{id}#{EXTENSION}"
    url_to_download = "#{BASE_URL}#{id}/"

    # FIX: the required '../extractors/base' file defines
    # Biblionet::Extractors::Base — the original referenced
    # Biblionet::Core::Base, which would raise NameError.
    downloader = Biblionet::Extractors::Base.new(url_to_download)
    downloader.save_page(file_to_save) unless downloader.page.nil?
  end
end
@@ -0,0 +1,35 @@
1
require_relative 'base'

module Biblionet
  module Crawlers

    # Crawls biblionet publisher pages (http://www.biblionet.gr/com/<id>/)
    # and stores each downloaded page under the local HTML storage folder.
    class PublisherCrawler < Base
      # Fills in publisher-specific defaults (folder, URL prefix, id range)
      # before delegating to the generic crawler Base.
      def initialize(options = {})
        options[:folder]    ||= 'lib/bookshark/storage/html_publisher_pages'
        options[:base_url]  ||= 'http://www.biblionet.gr/com/'
        options[:page_type] ||= 'publisher'
        options[:extension] ||= '.html'
        options[:start]     ||= 1
        options[:finish]    ||= 800
        options[:step]      ||= 100
        super(options)
      end

      # Visits every publisher URL yielded by the spider and saves the page,
      # skipping pages that failed to load or are too small (< 1024 bytes)
      # to contain real content.
      def crawl_and_save
        downloader = Extractors::Base.new

        spider do |url_to_download, file_to_save|
          downloader.load_page(url_to_download)

          # Ensure the target directory exists — mkdir_p is already a no-op
          # for existing directories, so no File.directory? guard is needed.
          FileUtils.mkdir_p(File.dirname(file_to_save))

          # FIX: use `||` instead of low-precedence `or` in the condition.
          downloader.save_page(file_to_save) unless downloader.page.nil? || downloader.page.length < 1024
        end
      end
    end

  end
end
@@ -0,0 +1,116 @@
1
+ require_relative 'base'
2
+
3
+ module Biblionet
4
+ module Extractors
5
+
6
# Extracts author data (name, lifetime, image, bio, awards) from a
# biblionet author page and exposes it as a Hash via #author.
class AuthorExtractor < Base
  attr_reader :author

  # Loads the page at +uri+ (via Base) and extracts the author immediately,
  # unless no uri was given or the page failed to load.
  def initialize(uri = nil)
    super(uri)
    extract_author unless uri.nil? || @page.nil?
  end

  # Loads a new page from +uri+ and extracts the author found on it.
  def load_and_extract_author(uri = nil)
    load_page(uri)
    extract_author unless uri.nil? || @page.nil?
  end

  # Builds and memoizes the author hash for the current page.
  #
  # Returns a Hash with keys :name, :firstname, :lastname, :lifetime,
  # :image, :bio, :award and :b_id.
  def extract_author(biblionet_id = @biblionet_id, author_page = @page)
    puts "Extracting author: #{biblionet_id}"
    page = AuthorDataExtractor.new(author_page)

    identity = split_name(page.fullname)

    author_hash = {}
    if present?(identity[:lastname]) && present?(identity[:firstname])
      author_hash[:name] = identity[:lastname] + ', ' + identity[:firstname]
    else
      # FIX: this branch was `elsif` with no condition, which made Ruby
      # parse the assignment below as the elsif's condition expression
      # (with an empty body). `else` is what was intended.
      author_hash[:name] = identity[:lastname]
    end
    author_hash[:firstname] = identity[:firstname]
    author_hash[:lastname]  = identity[:lastname]
    author_hash[:lifetime]  = identity[:lifetime]
    author_hash[:image]     = page.image
    author_hash[:bio]       = page.bio
    author_hash[:award]     = page.awards
    author_hash[:b_id]      = biblionet_id

    @author = author_hash
  end

  # Splits a biblionet full name such as
  # "Tolkien, John Ronald Reuel, 1892-1973" into a Hash with
  # :lastname, :firstname and :lifetime parts (missing parts are omitted).
  def split_name(fullname)
    # Matches digits-digits or digits- (e.g. "1892-1973" or "1892-").
    years_re = /\d+-\d*/

    parts = fullname.split(',').map(&:strip)

    identity = {}
    identity[:lastname] = parts[0]

    if parts.length == 2
      # The second comma part is either a lifetime span or a first name.
      if parts[1] =~ years_re
        identity[:lifetime] = parts[1]
      else
        identity[:firstname] = parts[1]
      end
    elsif parts.length == 3
      identity[:firstname] = parts[1]
      identity[:lifetime]  = parts[2]
    end

    identity
  end
end
74
+
75
# Wraps the Nokogiri document of an author page and exposes accessors
# for the individual author fields (fullname, bio, image, awards).
class AuthorDataExtractor
  attr_reader :nodeset

  # Parses only the CONTENT section of the page into @nodeset.
  def initialize(document)
    # No need to operate on the whole page — just the part between the
    # CONTENT START/END markers.
    content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
    match = content_re.match(document)
    # Dump the page for debugging when the expected markers are missing
    # (the [0] access below will then fail, as before).
    puts document if match.nil?
    content = match[0]

    @nodeset = Nokogiri::HTML(content)
  end

  def fullname
    @nodeset.css('h1.page_title').text
  end

  def bio
    # FIX: '//p[align="justify"]' is an XPath expression, not a CSS
    # selector — it must go through #xpath, with @ before the attribute.
    @nodeset.xpath('//p[@align="justify"]').text
  end

  # Absolute URL of the author photo, or nil when the page has none.
  def image
    img_node = @nodeset.xpath("//img[@src[contains(.,'/persons/')]][1]")
    return nil if img_node.nil? || img_node.empty?

    BASE_URL + (img_node.first)['src']
  end

  # Returns an array of { name:, year: } hashes, one per award link.
  def awards
    @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").map do |item|
      # The award year follows the link as a text sibling; strip non-digits.
      { name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '') }
    end
  end
end
114
+
115
+ end
116
+ end
@@ -0,0 +1,187 @@
1
+ require 'rubygems'
2
+ require 'open-uri'
3
+ require 'fileutils'
4
+ require 'nokogiri'
5
+ require 'json'
6
+ require 'logger'
7
+ require 'pp'
8
+ require 'htmlentities'
9
+
10
+ require File.expand_path(File.join(File.dirname(__FILE__), '../storage', 'file_manager'))
11
+
12
+ module Biblionet
13
+ module Extractors
14
+
15
+ BASE_URL = "http://www.biblionet.gr"
16
+
17
# Shared page loader: fetches a page from the web or from the local file
# system and keeps it in @page for extractors/crawlers to work on.
class Base
  include FileManager

  attr_reader :filepath, :url, :biblionet_id, :page

  # Maximum attempts for transient download failures before giving up.
  MAX_RETRIES = 3

  # Initializes the Base class. Without arguments nothing happens.
  # Otherwise loads a page by url or file.
  #
  # ==== Attributes
  #
  # * +uri+ - It can be a url or a path/to/file.ext on local storage.
  #
  def initialize(uri = nil)
    load_page(uri)
  end

  # Loads a page from the web or from local file storage depending on
  # the passed argument.
  #
  # ==== Attributes
  #
  # * +uri+ - A url (starting with http/https) or a path/to/file.ext.
  #
  def load_page(uri = nil)
    return if uri.nil?

    if uri.match(/\A#{URI::regexp(['http', 'https'])}\z/)
      load_page_from_url(uri)
    else
      load_page_from_file(uri)
    end
  end

  # Downloads a page from the web into @page (nil on failure).
  #
  # ==== Attributes
  #
  # * +url+ - The url of the webpage to download.
  #
  def load_page_from_url(url)
    retries = 0
    begin
      @url = url
      @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.

      pp "Downloading page: #{url}"
      open(url, :content_length_proc => lambda do |content_length|
        # Pages under 1 KB are treated as empty placeholders.
        raise EmptyPageError.new(url, content_length) unless content_length.nil? || content_length > 1024
      end) do |f|
        # Collapse all whitespace so later regex extraction is simpler.
        @page = f.read.gsub(/\s+/, " ")
      end
    rescue Errno::ENOENT => e
      pp "Page: #{url} NOT FOUND."
      pp e
      @page = nil # FIX: don't leave a previously loaded page to be saved.
    rescue EmptyPageError => e
      pp "Page: #{url} is EMPTY."
      pp e
      @page = nil
    rescue OpenURI::HTTPError => e
      pp e
      pp e.io.status
      @page = nil # FIX: same stale-page hazard on HTTP errors.
    rescue StandardError => e
      # FIX: the original retried forever; bound the retries so a
      # persistent failure surfaces instead of looping indefinitely.
      retries += 1
      raise if retries > MAX_RETRIES
      pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
      pp e
      sleep(120)
      retry
    end
  end

  # Reads a page from the local file system into @page.
  #
  # ==== Attributes
  #
  # * +filepath+ - The path to the target file which will be read.
  #
  def load_page_from_file(filepath)
    @filepath = filepath
    @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil?
    @page = open(filepath).read
  rescue StandardError => e
    puts e
  end

  # Attr writer method. Changes the url instance variable and loads a new
  # page by calling load_page_from_url.
  #
  # ==== Attributes
  #
  # * +url+ - The new value of the url instance var.
  #
  def url=(url)
    load_page_from_url(url)
  end

  # Attr writer method. Changes the filepath instance variable and loads a
  # new page by calling load_page_from_file.
  #
  # ==== Attributes
  #
  # * +filepath+ - The path to the target file which will be read.
  #
  def filepath=(filepath)
    load_page_from_file(filepath)
  end

  # Saves the current page to a file.
  #
  # ==== Attributes
  #
  # * +path+ - The path to file (including filename) where content is saved.
  #
  def save_page(path)
    save_to(path, @page)
    pp "Saving page: #{path}"
  end

  # Decodes text with escaped html entities and returns the decoded text.
  #
  # ==== Params:
  #
  # +encoded_text+:: the text which contains encoded entities
  #
  def decode_text(encoded_text)
    coder = HTMLEntities.new
    coder.decode(encoded_text)
  end

  # True when +value+ is neither nil nor empty.
  def present?(value)
    !value.nil? && !value.empty?
  end

end
151
+
152
+ # Raised when a page is considered empty.
153
+ #
154
# Raised when a downloaded page is too small to contain real content
# and is therefore considered empty.
class EmptyPageError < StandardError
  attr_reader :url, :content_length

  # url            - the page that was fetched
  # content_length - the reported size of the response, in bytes
  def initialize(url, content_length)
    @url = url
    @content_length = content_length
    super("Page: #{url} is only #{content_length} bytes, so it is considered EMPTY.")
  end
end
165
+
166
+ # Raised when something unexpected or in wrong format is parsed.
167
+ #
168
# Raised when something unexpected or malformed is encountered while
# parsing a book page.
class NoIdeaWhatThisIsError < StandardError
  attr_reader :biblionet_id, :the_unexpected

  # biblionet_id   - id of the book being parsed when the surprise appeared
  # the_unexpected - the value that could not be recognized
  def initialize(biblionet_id, the_unexpected)
    @biblionet_id = biblionet_id
    @the_unexpected = the_unexpected
    super("We have no idea what this: #{the_unexpected} is. At book #{biblionet_id}")
  end
end
179
+
180
+ end
181
+ end
182
+
183
+ # page = Parser::ParserBase.parse("http://www.biblionet.gr/book/300525")
184
+ # pp page.inspect
185
+ # Parser::ParserBase.parse("http://www.biblionet.gr/book/195243421")
186
+ # Parser::ParserBase.parse("dsdfds.com")
187
+ # Parser::ParserBase.save_to("book_195221.html", page)