bookshark 1.0.0.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +453 -0
- data/Rakefile +4 -0
- data/bookshark.gemspec +29 -0
- data/lib/bookshark.rb +371 -0
- data/lib/bookshark/crawlers/author_crawler.rb +42 -0
- data/lib/bookshark/crawlers/base.rb +46 -0
- data/lib/bookshark/crawlers/book_crawler.rb +55 -0
- data/lib/bookshark/crawlers/category_crawler.rb +55 -0
- data/lib/bookshark/crawlers/publisher_crawler.rb +35 -0
- data/lib/bookshark/extractors/author_extractor.rb +116 -0
- data/lib/bookshark/extractors/base.rb +187 -0
- data/lib/bookshark/extractors/book_extractor.rb +453 -0
- data/lib/bookshark/extractors/category_extractor.rb +82 -0
- data/lib/bookshark/extractors/publisher_extractor.rb +138 -0
- data/lib/bookshark/extractors/search.rb +104 -0
- data/lib/bookshark/storage/file_manager.rb +103 -0
- data/lib/bookshark/version.rb +3 -0
- data/spec/bookshark_spec.rb +96 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/test_data/author_13219.html +313 -0
- data/spec/test_data/author_13219.json +23 -0
- data/spec/test_data/book_103788.json +49 -0
- data/spec/test_data/category_1041.json +42 -0
- data/spec/test_data/eager_book_184923.json +215 -0
- data/spec/test_data/publisher_20.json +43 -0
- data/spec/test_data/search_01.json +355 -0
- data/spec/test_data/search_ids_01.json +13 -0
- data/tasks/console.rake +4 -0
- data/tasks/rspec.rake +3 -0
- metadata +191 -0
data/lib/bookshark/crawlers/book_crawler.rb
@@ -0,0 +1,55 @@
+require 'rubygems'
+require 'nokogiri'
+require 'open-uri'
+require 'fileutils'
+
+require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+# page = Nokogiri::HTML(open("raw_html_pages/book_45454.html"))
+# puts page.class # => Nokogiri::HTML::Document
+# puts page
+
+FOLDER = 'html_book_pages'
+BASE_URL = 'http://www.biblionet.gr/book/'
+EXTENSION = '.html'
+
+301000.step(400000, 1000) do |last|
+  # saved_pages = 0
+  # empty_pages = 0
+
+  first = last - 1000 + 1
+  subfolder = (last/1000 - 1).to_s
+  path = "#{FOLDER}/#{subfolder}/"
+
+  # Create a new directory (does nothing if directory exists)
+  FileUtils.mkdir_p path
+
+  first.upto(last) do |id|
+    file_to_save = "#{path}book_#{id}#{EXTENSION}"
+    url_to_download = "#{BASE_URL}#{id}/"
+
+    downloader = Biblionet::Extractors::Base.new(url_to_download)
+    downloader.save_page(file_to_save) unless downloader.page.nil?
+
+    # open(url_to_parse) do |uri|
+    #   puts "Parsing page: #{url_to_parse}"
+    #   page = uri.read.gsub(/\s+/, " ")
+    #   # doc = Nokogiri::HTML(page)
+    #   # body = doc.at('title').inner_html
+    #   # puts body
+    #   if page.include? "</body>"
+    #     puts "Saving page: #{file_to_save}"
+    #     open(file_to_save, "w") do |file|
+    #       file.write(page)
+    #     end
+    #     saved_pages += 1
+    #   else
+    #     puts "Page #{file_to_save} seems to be empty..."
+    #     empty_pages += 1
+    #   end
+    # end
+  end
+
+  # puts "Saved Pages: #{saved_pages}"
+  # puts "Empty Pages: #{empty_pages}"
+
+end
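For reference, a short sketch of the bucketing arithmetic the loop above uses, worked for its first iteration (illustrative only, not part of the package):

last      = 301000
first     = last - 1000 + 1         # => 300001
subfolder = (last / 1000 - 1).to_s  # => "300"
path      = "html_book_pages/#{subfolder}/"
# ids 300001..301000 are saved as html_book_pages/300/book_<id>.html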
data/lib/bookshark/crawlers/category_crawler.rb
@@ -0,0 +1,55 @@
+require 'rubygems'
+require 'nokogiri'
+require 'open-uri'
+require 'fileutils'
+
+require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+# page = Nokogiri::HTML(open("raw_html_pages/book_45454.html"))
+# puts page.class # => Nokogiri::HTML::Document
+# puts page
+
+FOLDER = 'html_dcc_pages'
+BASE_URL = 'http://www.biblionet.gr/index/'
+EXTENSION = '.html'
+
+1000.step(6000, 1000) do |last|
+  # saved_pages = 0
+  # empty_pages = 0
+
+  first = last - 1000 + 1
+  subfolder = (last/1000 - 1).to_s
+  path = "#{FOLDER}/#{subfolder}/"
+
+  # Create a new directory (does nothing if directory exists)
+  FileUtils.mkdir_p path
+
+  first.upto(last) do |id|
+    file_to_save = "#{path}dcc_#{id}#{EXTENSION}"
+    url_to_download = "#{BASE_URL}#{id}/"
+
+    downloader = Biblionet::Extractors::Base.new(url_to_download)
+    downloader.save_page(file_to_save) unless downloader.page.nil?
+
+    # open(url_to_parse) do |uri|
+    #   puts "Parsing page: #{url_to_parse}"
+    #   page = uri.read.gsub(/\s+/, " ")
+    #   # doc = Nokogiri::HTML(page)
+    #   # body = doc.at('title').inner_html
+    #   # puts body
+    #   if page.include? "</body>"
+    #     puts "Saving page: #{file_to_save}"
+    #     open(file_to_save, "w") do |file|
+    #       file.write(page)
+    #     end
+    #     saved_pages += 1
+    #   else
+    #     puts "Page #{file_to_save} seems to be empty..."
+    #     empty_pages += 1
+    #   end
+    # end
+  end
+
+  # puts "Saved Pages: #{saved_pages}"
+  # puts "Empty Pages: #{empty_pages}"
+
+end
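The index ("dcc") crawler applies the same arithmetic over 1000..6000, so its pages land in subfolders "0" through "5" (illustrative sketch):

(1000..6000).step(1000).map { |last| (last / 1000 - 1).to_s }
# => ["0", "1", "2", "3", "4", "5"]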
data/lib/bookshark/crawlers/publisher_crawler.rb
@@ -0,0 +1,35 @@
+require_relative 'base'
+
+module Biblionet
+  module Crawlers
+
+    class PublisherCrawler < Base
+      def initialize(options = {})
+        options[:folder]    ||= 'lib/bookshark/storage/html_publisher_pages'
+        options[:base_url]  ||= 'http://www.biblionet.gr/com/'
+        options[:page_type] ||= 'publisher'
+        options[:extension] ||= '.html'
+        options[:start]     ||= 1
+        options[:finish]    ||= 800
+        options[:step]      ||= 100
+        super(options)
+      end
+
+      def crawl_and_save
+        downloader = Extractors::Base.new
+
+        spider do |url_to_download, file_to_save|
+          downloader.load_page(url_to_download)
+
+          # Create a new directory (does nothing if directory exists)
+          path = File.dirname(file_to_save)
+          FileUtils.mkdir_p path unless File.directory?(path)
+
+          downloader.save_page(file_to_save) unless downloader.page.nil? or downloader.page.length < 1024
+
+        end
+      end
+    end
+
+  end
+end
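A minimal usage sketch for the class above. It assumes the crawlers' Base class (data/lib/bookshark/crawlers/base.rb in the manifest, not shown in this excerpt) implements `spider` so that it yields url/file pairs over the configured id range; the option values here are illustrative overrides of the defaults:

require 'bookshark'

crawler = Biblionet::Crawlers::PublisherCrawler.new(start: 1, finish: 100, folder: 'tmp/html_publisher_pages')
crawler.crawl_and_save  # downloads publisher pages 1..100, saving only responses of at least 1024 bytes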
data/lib/bookshark/extractors/author_extractor.rb
@@ -0,0 +1,116 @@
+require_relative 'base'
+
+module Biblionet
+  module Extractors
+
+    class AuthorExtractor < Base
+      attr_reader :author
+
+      def initialize(uri=nil)
+        super(uri)
+        extract_author unless uri.nil? or @page.nil?
+      end
+
+
+      def load_and_extract_author(uri=nil)
+        load_page(uri)
+        extract_author unless uri.nil? or @page.nil?
+      end
+
+      # def to_json_pretty
+      #   JSON.pretty_generate(@author) unless @author.nil?
+      # end
+
+      def extract_author(biblionet_id=@biblionet_id, author_page=@page)
+        puts "Extracting author: #{biblionet_id}"
+        page = AuthorDataExtractor.new(author_page)
+
+        identity = split_name(page.fullname)
+
+        author_hash = {}
+        if present?(identity[:lastname]) and present?(identity[:firstname])
+          author_hash[:name] = identity[:lastname] + ', ' + identity[:firstname]
+        else
+          author_hash[:name] = identity[:lastname]
+        end
+        author_hash[:firstname] = identity[:firstname]
+        author_hash[:lastname] = identity[:lastname]
+        author_hash[:lifetime] = identity[:lifetime]
+        author_hash[:image] = page.image
+        author_hash[:bio] = page.bio
+        author_hash[:award] = page.awards
+        author_hash[:b_id] = biblionet_id
+
+        # puts JSON.pretty_generate(author_hash)
+
+        return @author = author_hash
+      end
+
+      def split_name(fullname)
+        # Matches digits-digits or digits- in text like: Tolkien, John Ronald Reuel, 1892-1973
+        years_re = /\d+-\d*/
+
+        parts = fullname.split(',').map(&:strip)
+
+        identity = {}
+        identity[:lastname] = parts[0]
+
+        if parts.length == 2
+          if parts[1] =~ years_re
+            identity[:lifetime] = parts[1]
+          else
+            identity[:firstname] = parts[1]
+          end
+        elsif parts.length == 3
+          identity[:firstname] = parts[1]
+          identity[:lifetime] = parts[2]
+        end
+
+        return identity
+
+      end
+
+    end
+
+    class AuthorDataExtractor
+      attr_reader :nodeset
+
+      def initialize(document)
+        # No need to operate on whole page. Just on part containing the content.
+        content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+        if (content_re.match(document)).nil?
+          puts document
+        end
+        content = content_re.match(document)[0]
+
+        @nodeset = Nokogiri::HTML(content)
+      end
+
+      def fullname
+        @nodeset.css('h1.page_title').text
+      end
+
+      def bio
+        @nodeset.xpath('//p[@align="justify"]').text
+      end
+
+      def image
+        img_node = @nodeset.xpath("//img[@src[contains(.,'/persons/')]][1]")
+        img = (img_node.nil? or img_node.empty?) ? nil : BASE_URL+(img_node.first)['src']
+        return img
+      end
+
+      def awards
+        awards = []
+        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").each do |item|
+          award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
+          awards << award
+        end
+
+        return awards
+      end
+
+    end
+
+  end
+end
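A worked example of `split_name` on the exact format its comment cites (instantiating with no argument loads nothing, which the extractor permits):

extractor = Biblionet::Extractors::AuthorExtractor.new
extractor.split_name('Tolkien, John Ronald Reuel, 1892-1973')
# => {:lastname=>"Tolkien", :firstname=>"John Ronald Reuel", :lifetime=>"1892-1973"}

Calling `new` with an author page URL instead (the URL shape is assumed from the spec fixtures, e.g. author_13219) populates `#author` with the name/lifetime/image/bio/award hash built in `extract_author`.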
data/lib/bookshark/extractors/base.rb
@@ -0,0 +1,187 @@
+require 'rubygems'
+require 'open-uri'
+require 'fileutils'
+require 'nokogiri'
+require 'json'
+require 'logger'
+require 'pp'
+require 'htmlentities'
+
+require File.expand_path(File.join(File.dirname(__FILE__), '../storage', 'file_manager'))
+
+module Biblionet
+  module Extractors
+
+    BASE_URL = "http://www.biblionet.gr"
+
+    class Base
+      include FileManager
+
+      attr_reader :filepath, :url, :biblionet_id, :page
+
+      # Initializes the Base class. Without arguments nothing happens. Otherwise loads a page by url or file.
+      #
+      # ==== Attributes
+      #
+      # * +uri+ - It can be a url or a path/to/file.ext on local storage.
+      #
+      def initialize(uri=nil)
+        load_page(uri)
+      end
+
+      # Loads a page from the web or from local file storage depending on the passed argument.
+      #
+      # ==== Attributes
+      #
+      # * +uri+ - It can be a url (starting with http/https) or a path/to/file.ext on local storage.
+      #
+      def load_page(uri=nil)
+        if uri.match(/\A#{URI::regexp(['http', 'https'])}\z/)
+          load_page_from_url(uri)
+        else
+          load_page_from_file(uri)
+        end unless uri.nil?
+      end
+
+      # Downloads a page from the web.
+      #
+      # ==== Attributes
+      #
+      # * +url+ - The url of the webpage to download.
+      #
+      def load_page_from_url(url)
+        begin
+          @url = url
+          @biblionet_id = url[/\d+(?!.*\d+)/] unless url.nil? # id is expected to be the last number.
+
+          pp "Downloading page: #{url}"
+          open(url, :content_length_proc => lambda do |content_length|
+            raise EmptyPageError.new(url, content_length) unless content_length.nil? or content_length > 1024
+          end) do |f|
+            # pp f.status == ["200", "OK"] ? "success: #{f.status}" : f.status
+            # pp f.meta
+            # pp "Content-Type: " + f.content_type
+            # pp "Content-Size: " + (f.meta)["content-length"]
+            # pp "last modified" + f.last_modified.to_s + is_empty = (f.last_modified.nil?) ? 'Empty' : 'Not Empty'
+
+            @page = f.read.gsub(/\s+/, " ")
+          end
+        rescue Errno::ENOENT => e
+          pp "Page: #{url} NOT FOUND."
+          pp e
+        rescue EmptyPageError => e
+          pp "Page: #{url} is EMPTY."
+          pp e
+          @page = nil
+        rescue OpenURI::HTTPError => e
+          pp e
+          pp e.io.status
+        rescue StandardError => e
+          pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
+          pp e
+          sleep(120)
+          retry
+        end
+      end
+
+      # Reads a page from the local file system.
+      #
+      # ==== Attributes
+      #
+      # * +filepath+ - The path to the target file which will be read.
+      #
+      def load_page_from_file(filepath)
+        begin
+          @filepath = filepath
+          @biblionet_id = filepath[/\d+(?!.*\d+)/] unless filepath.nil?
+          @page = open(filepath).read
+        rescue StandardError => e
+          puts e
+        end
+      end
+
+      # Attr writer method. Changes instance variables url, page and loads a new page by calling load_page_from_url.
+      #
+      # ==== Attributes
+      #
+      # * +url+ - The new value of the url instance var.
+      #
+      def url=(url)
+        load_page_from_url(url)
+      end
+
+      # Attr writer method. Changes instance variables filepath, page and loads a new page by calling load_page_from_file.
+      #
+      # ==== Attributes
+      #
+      # * +filepath+ - The path to the target file which will be read.
+      #
+      def filepath=(filepath)
+        load_page_from_file(filepath)
+      end
+
+      # Saves page to file.
+      #
+      # ==== Attributes
+      #
+      # * +path+ - The path to the file (including filename) where content will be saved.
+      #
+      def save_page(path)
+        save_to(path, @page)
+        pp "Saving page: #{path}"
+      end
+
+      # Decodes text with escaped html entities and returns the decoded text.
+      #
+      # ==== Params:
+      #
+      # +encoded_text+:: the text which contains encoded entities
+      #
+      def decode_text(encoded_text)
+        # encoded_text = File.read(encoded_file_path)
+        coder = HTMLEntities.new
+        coder.decode(encoded_text)
+      end
+
+      def present?(value)
+        return (not value.nil? and not value.empty?) ? true : false
+      end
+
+    end
+
+    # Raised when a page is considered empty.
+    #
+    class EmptyPageError < StandardError
+      attr_reader :url, :content_length
+
+      def initialize(url, content_length)
+        @url = url
+        @content_length = content_length
+
+        msg = "Page: #{url} is only #{content_length} bytes, so it is considered EMPTY."
+        super(msg)
+      end
+    end
+
+    # Raised when something unexpected or in the wrong format is parsed.
+    #
+    class NoIdeaWhatThisIsError < StandardError
+      attr_reader :biblionet_id, :the_unexpected
+
+      def initialize(biblionet_id, the_unexpected)
+        @biblionet_id = biblionet_id
+        @the_unexpected = the_unexpected
+
+        msg = "We have no idea what this: #{the_unexpected} is. At book #{biblionet_id}"
+        super(msg)
+      end
+    end
+
+  end
+end
+
+# page = Parser::ParserBase.parse("http://www.biblionet.gr/book/300525")
+# pp page.inspect
+# Parser::ParserBase.parse("http://www.biblionet.gr/book/195243421")
+# Parser::ParserBase.parse("dsdfds.com")
+# Parser::ParserBase.save_to("book_195221.html", page)
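A minimal sketch of the two loading paths `load_page` dispatches between; the writer methods re-load in place, and `biblionet_id` is always scraped as the last number in the uri (the local path below is taken from the spec fixtures in the manifest):

page = Biblionet::Extractors::Base.new('http://www.biblionet.gr/book/103788/')
page.biblionet_id                                   # => "103788" (fetched via open-uri)
page.filepath = 'spec/test_data/author_13219.html'  # re-loads from local storage instead
page.biblionet_id                                   # => "13219"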