bookshark 1.0.0.alpha.3 → 1.0.0.alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bookshark/extractors/author_extractor.rb +7 -2
- data/lib/bookshark/extractors/book_extractor.rb +14 -5
- data/lib/bookshark/extractors/category_extractor.rb +9 -6
- data/lib/bookshark/extractors/publisher_extractor.rb +10 -3
- data/lib/bookshark/version.rb +1 -1
- data/spec/bookshark_spec.rb +116 -19
- data/spec/test_data/empty_author.json +5 -0
- data/spec/test_data/empty_book.json +5 -0
- data/spec/test_data/empty_category.json +5 -0
- data/spec/test_data/empty_publisher.json +5 -0
- metadata +9 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 818b101a82314fcff676b111f3e407821a9f0dc5
|
4
|
+
data.tar.gz: 12dd9368013a7911b4ce8c3f799db90e42c80671
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b937fcae31844c3742ff6ad00ef91e27a24fe2d8f67c2319fbafeab5db16e1fdc382e26fb37a9d1409ba446fefe8c4407b9cc3d7d95038da6533e09aac2a8900
|
7
|
+
data.tar.gz: 70932725282459f1b6c161517630c5a8edd364efca16d4f2e41b1170ebba721368b9fef14eab7d9fa710c2ebd2e099374d76f53ee15c40b9df0dfa99799eab0e
|
data/lib/bookshark/extractors/author_extractor.rb
CHANGED
@@ -43,7 +43,12 @@ module Biblionet
|
|
43
43
|
|
44
44
|
# puts JSON.pretty_generate(author_hash)
|
45
45
|
|
46
|
-
|
46
|
+
if author_hash[:lastname].nil? and author_hash[:firstname].nil?
|
47
|
+
return nil
|
48
|
+
else
|
49
|
+
return @author = author_hash
|
50
|
+
end
|
51
|
+
|
47
52
|
end
|
48
53
|
|
49
54
|
def split_name(fullname)
|
@@ -81,7 +86,7 @@ module Biblionet
|
|
81
86
|
if (content_re.match(document)).nil?
|
82
87
|
puts document
|
83
88
|
end
|
84
|
-
content = content_re.match(document)[0]
|
89
|
+
content = content_re.match(document)[0] unless (content_re.match(document)).nil?
|
85
90
|
|
86
91
|
@nodeset = Nokogiri::HTML(content)
|
87
92
|
end
|
data/lib/bookshark/extractors/book_extractor.rb
CHANGED
@@ -9,12 +9,12 @@ module Biblionet
|
|
9
9
|
|
10
10
|
def initialize(uri=nil)
|
11
11
|
super(uri)
|
12
|
-
extract_book unless uri.nil?
|
12
|
+
extract_book unless uri.nil? or @page.nil?
|
13
13
|
end
|
14
14
|
|
15
15
|
def load_and_extract_book(uri=nil)
|
16
16
|
load_page(uri)
|
17
|
-
extract_book unless uri.nil?
|
17
|
+
extract_book unless uri.nil? or @page.nil?
|
18
18
|
end
|
19
19
|
|
20
20
|
# Converts the parsed contributors string to hash.
|
@@ -116,6 +116,10 @@ module Biblionet
|
|
116
116
|
|
117
117
|
page = BookDataExtractor.new(book_page)
|
118
118
|
|
119
|
+
# End extraction if BookDataExtractor couldnt create a nodeset
|
120
|
+
return nil if page.nodeset.nil?
|
121
|
+
|
122
|
+
|
119
123
|
book_hash = Hash.new
|
120
124
|
|
121
125
|
begin
|
@@ -200,9 +204,14 @@ module Biblionet
|
|
200
204
|
if (content_re.match(document)).nil?
|
201
205
|
puts document
|
202
206
|
end
|
203
|
-
content = content_re.match(document)[0]
|
204
|
-
|
205
|
-
|
207
|
+
content = content_re.match(document)[0] unless (content_re.match(document)).nil?
|
208
|
+
|
209
|
+
# If content is nil, there is something wrong with the html, so return nil
|
210
|
+
if content.nil?
|
211
|
+
@nodeset = nil
|
212
|
+
else
|
213
|
+
@nodeset = Nokogiri::HTML(content)
|
214
|
+
end
|
206
215
|
end
|
207
216
|
|
208
217
|
def image
|
data/lib/bookshark/extractors/category_extractor.rb
CHANGED
@@ -8,7 +8,7 @@ module Biblionet
|
|
8
8
|
|
9
9
|
def initialize(uri=nil)
|
10
10
|
super(uri)
|
11
|
-
extract_categories unless uri.nil?
|
11
|
+
extract_categories unless uri.nil? or @page.nil?
|
12
12
|
end
|
13
13
|
|
14
14
|
def extract_categories(category_page=@page)
|
@@ -43,15 +43,18 @@ module Biblionet
|
|
43
43
|
category_hash = {biblionet_id => category.clone}
|
44
44
|
end.reduce({}, :update) unless @page.nil?
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
if present?(@categories)
|
47
|
+
@categories[:current] = (@categories[@biblionet_id.to_s].clone)
|
48
|
+
@categories[:current][:b_id] = @biblionet_id
|
49
|
+
return @categories
|
50
|
+
else
|
51
|
+
return nil
|
52
|
+
end
|
50
53
|
end
|
51
54
|
|
52
55
|
def extract_categories_from(uri=nil)
|
53
56
|
load_page(uri)
|
54
|
-
extract_categories unless uri.nil?
|
57
|
+
extract_categories unless uri.nil? or @page.nil?
|
55
58
|
end
|
56
59
|
|
57
60
|
|
data/lib/bookshark/extractors/publisher_extractor.rb
CHANGED
@@ -21,6 +21,8 @@ module Biblionet
|
|
21
21
|
puts "Extracting publisher: #{biblionet_id}"
|
22
22
|
page = PublisherDataExtractor.new(publisher_page)
|
23
23
|
|
24
|
+
return nil if page.nodeset.nil?
|
25
|
+
|
24
26
|
headquarters = page.headquarters
|
25
27
|
bookstores = page.bookstores
|
26
28
|
bookstores['Έδρα'] = headquarters
|
@@ -46,9 +48,14 @@ module Biblionet
|
|
46
48
|
if (content_re.match(document)).nil?
|
47
49
|
puts document
|
48
50
|
end
|
49
|
-
content = content_re.match(document)[0]
|
50
|
-
|
51
|
-
|
51
|
+
content = content_re.match(document)[0] unless (content_re.match(document)).nil?
|
52
|
+
|
53
|
+
# If content is nil, there is something wrong with the html, so return nil
|
54
|
+
if content.nil?
|
55
|
+
@nodeset = nil
|
56
|
+
else
|
57
|
+
@nodeset = Nokogiri::HTML(content)
|
58
|
+
end
|
52
59
|
end
|
53
60
|
|
54
61
|
def name
|
data/lib/bookshark/version.rb
CHANGED
data/spec/bookshark_spec.rb
CHANGED
@@ -9,13 +9,37 @@ describe Bookshark::Extractor do
|
|
9
9
|
let(:eager_book_184923) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/eager_book_184923.json", File.dirname(__FILE__))).read)) }
|
10
10
|
let(:search_01) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_01.json" , File.dirname(__FILE__))).read)) }
|
11
11
|
let(:search_ids_01) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_ids_01.json" , File.dirname(__FILE__))).read)) }
|
12
|
+
let(:empty_book) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/empty_book.json" , File.dirname(__FILE__))).read)) }
|
13
|
+
let(:empty_author) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/empty_author.json" , File.dirname(__FILE__))).read)) }
|
14
|
+
let(:empty_publisher) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/empty_publisher.json" , File.dirname(__FILE__))).read)) }
|
15
|
+
let(:empty_category) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/empty_category.json" , File.dirname(__FILE__))).read)) }
|
12
16
|
|
13
17
|
|
14
18
|
describe '#author' do
|
15
19
|
context 'from remote html source' do
|
16
|
-
|
17
|
-
|
20
|
+
context 'when the author exists' do
|
21
|
+
it 'reads html from the web based on given id and extracts author data' do
|
22
|
+
expect(subject.author(id: 13219)).to eq author_13219
|
23
|
+
end
|
24
|
+
it 'reads html from the web based on given uri and extracts author data' do
|
25
|
+
expect(subject.author(uri: "http://biblionet.gr/author/13219")).to eq author_13219
|
26
|
+
end
|
18
27
|
end
|
28
|
+
context 'when the author doesnt exist' do
|
29
|
+
it 'returns an empty array' do
|
30
|
+
expect(subject.author(id: 0)).to eq empty_author
|
31
|
+
end
|
32
|
+
end
|
33
|
+
context 'when no options are set' do
|
34
|
+
it 'returns an empty array' do
|
35
|
+
expect(subject.author).to eq empty_author
|
36
|
+
end
|
37
|
+
end
|
38
|
+
context 'when a the given uri is wrong' do
|
39
|
+
it 'returns an empty array' do
|
40
|
+
expect(subject.author(uri: "http://google.com")).to eq empty_author
|
41
|
+
end
|
42
|
+
end
|
19
43
|
end
|
20
44
|
|
21
45
|
context 'from local storage' do
|
@@ -28,41 +52,114 @@ describe Bookshark::Extractor do
|
|
28
52
|
|
29
53
|
describe '#publisher' do
|
30
54
|
context 'extract from remote html source' do
|
31
|
-
|
32
|
-
|
55
|
+
context 'when the publisher exists' do
|
56
|
+
it 'reads html from the web based on given id and extracts publisher data' do
|
57
|
+
expect(subject.publisher(id: 20)).to eq publisher_20
|
58
|
+
end
|
59
|
+
it 'reads html from the web based on given uri and extracts publisher data' do
|
60
|
+
expect(subject.publisher(uri: "http://biblionet.gr/com/20")).to eq publisher_20
|
61
|
+
end
|
33
62
|
end
|
63
|
+
context 'when the publisher doesnt exist' do
|
64
|
+
it 'returns an empty array' do
|
65
|
+
expect(subject.publisher(id: 0)).to eq empty_publisher
|
66
|
+
end
|
67
|
+
end
|
68
|
+
context 'when no options are set' do
|
69
|
+
it 'returns an empty array' do
|
70
|
+
expect(subject.publisher).to eq empty_publisher
|
71
|
+
end
|
72
|
+
end
|
73
|
+
context 'when a the given uri is wrong' do
|
74
|
+
it 'returns an empty array' do
|
75
|
+
expect(subject.publisher(uri: "http://google.com")).to eq empty_publisher
|
76
|
+
end
|
77
|
+
end
|
34
78
|
end
|
35
79
|
end
|
36
80
|
|
37
81
|
describe '#category' do
|
38
82
|
context 'extract from remote html source' do
|
39
|
-
|
40
|
-
|
83
|
+
context 'when the category exists' do
|
84
|
+
it 'reads html from the web based on given id and extracts category data' do
|
85
|
+
expect(subject.category(id: 1041)).to eq category_1041
|
86
|
+
end
|
87
|
+
it 'reads html from the web based on given uri and extracts category data' do
|
88
|
+
expect(subject.category(uri: "http://biblionet.gr/index/1041")).to eq category_1041
|
89
|
+
end
|
41
90
|
end
|
91
|
+
context 'when the category doesnt exist' do
|
92
|
+
it 'returns an empty array' do
|
93
|
+
expect(subject.category(id: 0)).to eq empty_category
|
94
|
+
end
|
95
|
+
end
|
96
|
+
context 'when no options are set' do
|
97
|
+
it 'returns an empty array' do
|
98
|
+
expect(subject.category).to eq empty_category
|
99
|
+
end
|
100
|
+
end
|
101
|
+
context 'when a the given uri is wrong' do
|
102
|
+
it 'returns an empty array' do
|
103
|
+
expect(subject.category(uri: "http://google.com")).to eq empty_category
|
104
|
+
end
|
105
|
+
end
|
42
106
|
end
|
43
107
|
end
|
44
108
|
|
45
109
|
describe '#book' do
|
46
110
|
context 'extract from remote html source' do
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
111
|
+
context 'when book exists' do
|
112
|
+
it 'reads html from the web based on given id and extracts book data' do
|
113
|
+
expect(subject.book(id: 103788)).to eq book_103788
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'reads html from the web based on given uri and extracts book data' do
|
117
|
+
expect(subject.book(uri: "http://biblionet.gr/book/103788")).to eq book_103788
|
118
|
+
end
|
119
|
+
|
120
|
+
it 'reads html from the web and eager extracts all book and reference data' do
|
121
|
+
expect(subject.book(id: 184923, eager: true)).to eq eager_book_184923
|
122
|
+
end
|
53
123
|
end
|
124
|
+
context 'when the book doesnt exist' do
|
125
|
+
it 'returns an empty array' do
|
126
|
+
expect(subject.book(id: 0)).to eq empty_book
|
127
|
+
end
|
128
|
+
end
|
129
|
+
context 'when no options are set' do
|
130
|
+
it 'returns an empty array' do
|
131
|
+
expect(subject.book).to eq empty_book
|
132
|
+
end
|
133
|
+
end
|
134
|
+
context 'when a the given uri is wrong' do
|
135
|
+
it 'returns an empty array' do
|
136
|
+
expect(subject.book(uri: "http://google.com")).to eq empty_book
|
137
|
+
end
|
138
|
+
end
|
54
139
|
end
|
55
140
|
end
|
56
141
|
|
57
142
|
describe '#search' do
|
58
|
-
context 'extract from remote html source' do
|
59
|
-
|
60
|
-
|
143
|
+
context 'search and extract from remote html source' do
|
144
|
+
context 'when books are found' do
|
145
|
+
it 'builds a search url and extracts book ids from search page' do
|
146
|
+
expect(subject.search(title: 'σημεια και τερατα', results_type: 'ids')).to eq search_ids_01
|
147
|
+
end
|
148
|
+
|
149
|
+
it 'builds a search url and extracts book data from search page' do
|
150
|
+
expect(subject.search(title: 'σημεια και τερατα', results_type: 'metadata')).to eq search_01
|
151
|
+
end
|
61
152
|
end
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
153
|
+
context 'when no books are found' do
|
154
|
+
it 'returns an empty array' do
|
155
|
+
expect(subject.search(isbn: 'some-invalid-isbn')).to eq empty_book
|
156
|
+
end
|
157
|
+
end
|
158
|
+
context 'when no options are set' do
|
159
|
+
it 'returns an empty array' do
|
160
|
+
expect(subject.search).to eq empty_book
|
161
|
+
end
|
162
|
+
end
|
66
163
|
end
|
67
164
|
end
|
68
165
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bookshark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.alpha.3
|
4
|
+
version: 1.0.0.alpha.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitris Klisiaris
|
@@ -149,6 +149,10 @@ files:
|
|
149
149
|
- spec/test_data/book_103788.json
|
150
150
|
- spec/test_data/category_1041.json
|
151
151
|
- spec/test_data/eager_book_184923.json
|
152
|
+
- spec/test_data/empty_author.json
|
153
|
+
- spec/test_data/empty_book.json
|
154
|
+
- spec/test_data/empty_category.json
|
155
|
+
- spec/test_data/empty_publisher.json
|
152
156
|
- spec/test_data/publisher_20.json
|
153
157
|
- spec/test_data/search_01.json
|
154
158
|
- spec/test_data/search_ids_01.json
|
@@ -186,6 +190,10 @@ test_files:
|
|
186
190
|
- spec/test_data/book_103788.json
|
187
191
|
- spec/test_data/category_1041.json
|
188
192
|
- spec/test_data/eager_book_184923.json
|
193
|
+
- spec/test_data/empty_author.json
|
194
|
+
- spec/test_data/empty_book.json
|
195
|
+
- spec/test_data/empty_category.json
|
196
|
+
- spec/test_data/empty_publisher.json
|
189
197
|
- spec/test_data/publisher_20.json
|
190
198
|
- spec/test_data/search_01.json
|
191
199
|
- spec/test_data/search_ids_01.json
|