bookshark 1.0.0.beta.3 → 1.0.0.beta.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/.travis.yml +2 -0
- data/bookshark.gemspec +1 -1
- data/lib/bookshark.rb +50 -3
- data/lib/bookshark/extractors/publisher_extractor.rb +2 -2
- data/lib/bookshark/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a592c6e055f501c19f4a7dea23bad79446b6b28c
|
4
|
+
data.tar.gz: 797820d896b398e6294f9804a4a42f151682a027
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f8f7b1b0b0f7312549964153afe8198aab00b0acabdb9693c5e9f7c8242e90a19357fb3507397df6cd5b3278cb1ceb22d2b45dfb4646851e698b5cf2bfeb96d
|
7
|
+
data.tar.gz: 73e4f60098595c1e5b20101bf66c5d24d3cc2e2659c5776a7973635a9789149e5a6d2a2a4a69ff7af126314f25f9b2e31824fd131879f0b3b5939eff55462604
|
data/.gitignore
CHANGED
@@ -8,7 +8,12 @@
|
|
8
8
|
/pkg/
|
9
9
|
/spec/reports/
|
10
10
|
/tmp/
|
11
|
+
/lib/bookshark/storage/html_book_pages/
|
12
|
+
/lib/bookshark/storage/html_author_pages/
|
11
13
|
/lib/bookshark/storage/html_publisher_pages/
|
14
|
+
/lib/bookshark/storage/json_book_pages/
|
15
|
+
/lib/bookshark/storage/json_author_pages/
|
16
|
+
/lib/bookshark/storage/json_publisher_pages/
|
12
17
|
/lib/bookshark/logs/*.log
|
13
18
|
*.bundle
|
14
19
|
*.so
|
data/.travis.yml
CHANGED
data/bookshark.gemspec
CHANGED
@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_dependency "json", "~> 1.8"
|
26
26
|
spec.add_dependency "htmlentities", "~> 4.3"
|
27
27
|
|
28
|
-
spec.add_development_dependency "bundler", "
|
28
|
+
spec.add_development_dependency "bundler", ">= 1.6"
|
29
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
30
30
|
spec.add_development_dependency 'rspec', "~> 3.2"
|
31
31
|
end
|
data/lib/bookshark.rb
CHANGED
@@ -119,6 +119,54 @@ module Bookshark
|
|
119
119
|
return response
|
120
120
|
end
|
121
121
|
|
122
|
+
def books_from_storage
|
123
|
+
extract_from_storage_and_save('book', 'html_book_pages', 'json_book_pages')
|
124
|
+
end
|
125
|
+
|
126
|
+
def authors_from_storage
|
127
|
+
extract_from_storage_and_save('author', 'html_author_pages', 'json_author_pages')
|
128
|
+
end
|
129
|
+
|
130
|
+
def publishers_from_storage
|
131
|
+
extract_from_storage_and_save('publisher', 'html_publisher_pages', 'json_publisher_pages')
|
132
|
+
end
|
133
|
+
|
134
|
+
def categories_from_storage
|
135
|
+
extract_from_storage_and_save('category', 'html_category_pages', 'json_category_pages')
|
136
|
+
end
|
137
|
+
|
138
|
+
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
|
139
|
+
list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
|
140
|
+
dir_to_save = dir.gsub(source_dir, target_dir)
|
141
|
+
|
142
|
+
list_files(path: dir, extension: 'html', all:true).each do |file|
|
143
|
+
puts "Extracting from file: " + file.to_s
|
144
|
+
|
145
|
+
# Extract publisher metadata from local file.
|
146
|
+
options = {uri: file, format: 'pretty_json', local: true}
|
147
|
+
|
148
|
+
case metadata_type
|
149
|
+
when 'author'
|
150
|
+
record = author(options)
|
151
|
+
when 'publisher'
|
152
|
+
record = publisher(options)
|
153
|
+
when 'book'
|
154
|
+
record = book(options)
|
155
|
+
when 'category'
|
156
|
+
record = category(options)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Prepare a path to save the new file.
|
160
|
+
filename = File.basename(file,".*")
|
161
|
+
path_to_save = "#{dir_to_save}#{filename}.json"
|
162
|
+
|
163
|
+
# Save to file.
|
164
|
+
save_to("#{path_to_save}", record)
|
165
|
+
|
166
|
+
end # unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
122
170
|
def parse_all_categories(will_save=false)
|
123
171
|
# list_directories('raw_ddc_pages').each do |dir|
|
124
172
|
# p dir
|
@@ -166,9 +214,8 @@ module Bookshark
|
|
166
214
|
private
|
167
215
|
|
168
216
|
def process_options(options = {}, caller = nil)
|
169
|
-
# puts
|
170
|
-
|
171
|
-
puts caller
|
217
|
+
# puts "Called from method: " + caller.to_s
|
218
|
+
|
172
219
|
id = options[:id]
|
173
220
|
|
174
221
|
if id
|
@@ -28,7 +28,7 @@ module Biblionet
|
|
28
28
|
|
29
29
|
headquarters = page.headquarters
|
30
30
|
bookstores = page.bookstores
|
31
|
-
bookstores['Έδρα'] = headquarters
|
31
|
+
bookstores['Έδρα'] = headquarters unless headquarters.all? {|k,v| v.nil? or v.empty?}
|
32
32
|
|
33
33
|
publisher_hash = {}
|
34
34
|
publisher_hash[:name] = page.name
|
@@ -97,7 +97,7 @@ module Biblionet
|
|
97
97
|
# Change keys. Use the same as in bookstores.
|
98
98
|
mappings = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
|
99
99
|
headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
|
100
|
-
headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array)
|
100
|
+
headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) or headquarters_hash[:telephone].nil?
|
101
101
|
headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',')
|
102
102
|
|
103
103
|
return headquarters_hash
|
data/lib/bookshark/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bookshark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.beta.3
|
4
|
+
version: 1.0.0.beta.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitris Klisiaris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -76,16 +76,16 @@ dependencies:
|
|
76
76
|
name: bundler
|
77
77
|
requirement: !ruby/object:Gem::Requirement
|
78
78
|
requirements:
|
79
|
-
- - "
|
79
|
+
- - ">="
|
80
80
|
- !ruby/object:Gem::Version
|
81
|
-
version: '1.
|
81
|
+
version: '1.6'
|
82
82
|
type: :development
|
83
83
|
prerelease: false
|
84
84
|
version_requirements: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|
86
|
-
- - "
|
86
|
+
- - ">="
|
87
87
|
- !ruby/object:Gem::Version
|
88
|
-
version: '1.
|
88
|
+
version: '1.6'
|
89
89
|
- !ruby/object:Gem::Dependency
|
90
90
|
name: rake
|
91
91
|
requirement: !ruby/object:Gem::Requirement
|