bookshark 1.0.0.beta.3 → 1.0.0.beta.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -0
- data/.travis.yml +2 -0
- data/bookshark.gemspec +1 -1
- data/lib/bookshark.rb +50 -3
- data/lib/bookshark/extractors/publisher_extractor.rb +2 -2
- data/lib/bookshark/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a592c6e055f501c19f4a7dea23bad79446b6b28c
|
4
|
+
data.tar.gz: 797820d896b398e6294f9804a4a42f151682a027
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1f8f7b1b0b0f7312549964153afe8198aab00b0acabdb9693c5e9f7c8242e90a19357fb3507397df6cd5b3278cb1ceb22d2b45dfb4646851e698b5cf2bfeb96d
|
7
|
+
data.tar.gz: 73e4f60098595c1e5b20101bf66c5d24d3cc2e2659c5776a7973635a9789149e5a6d2a2a4a69ff7af126314f25f9b2e31824fd131879f0b3b5939eff55462604
|
data/.gitignore
CHANGED
@@ -8,7 +8,12 @@
|
|
8
8
|
/pkg/
|
9
9
|
/spec/reports/
|
10
10
|
/tmp/
|
11
|
+
/lib/bookshark/storage/html_book_pages/
|
12
|
+
/lib/bookshark/storage/html_author_pages/
|
11
13
|
/lib/bookshark/storage/html_publisher_pages/
|
14
|
+
/lib/bookshark/storage/json_book_pages/
|
15
|
+
/lib/bookshark/storage/json_author_pages/
|
16
|
+
/lib/bookshark/storage/json_publisher_pages/
|
12
17
|
/lib/bookshark/logs/*.log
|
13
18
|
*.bundle
|
14
19
|
*.so
|
data/.travis.yml
CHANGED
data/bookshark.gemspec
CHANGED
@@ -25,7 +25,7 @@ Gem::Specification.new do |spec|
|
|
25
25
|
spec.add_dependency "json", "~> 1.8"
|
26
26
|
spec.add_dependency "htmlentities", "~> 4.3"
|
27
27
|
|
28
|
-
spec.add_development_dependency "bundler", "
|
28
|
+
spec.add_development_dependency "bundler", ">= 1.6"
|
29
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
30
30
|
spec.add_development_dependency 'rspec', "~> 3.2"
|
31
31
|
end
|
data/lib/bookshark.rb
CHANGED
@@ -119,6 +119,54 @@ module Bookshark
|
|
119
119
|
return response
|
120
120
|
end
|
121
121
|
|
122
|
+
def books_from_storage
|
123
|
+
extract_from_storage_and_save('book', 'html_book_pages', 'json_book_pages')
|
124
|
+
end
|
125
|
+
|
126
|
+
def authors_from_storage
|
127
|
+
extract_from_storage_and_save('author', 'html_author_pages', 'json_author_pages')
|
128
|
+
end
|
129
|
+
|
130
|
+
def publishers_from_storage
|
131
|
+
extract_from_storage_and_save('publisher', 'html_publisher_pages', 'json_publisher_pages')
|
132
|
+
end
|
133
|
+
|
134
|
+
def categories_from_storage
|
135
|
+
extract_from_storage_and_save('category', 'html_category_pages', 'json_category_pages')
|
136
|
+
end
|
137
|
+
|
138
|
+
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
|
139
|
+
list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
|
140
|
+
dir_to_save = dir.gsub(source_dir, target_dir)
|
141
|
+
|
142
|
+
list_files(path: dir, extension: 'html', all:true).each do |file|
|
143
|
+
puts "Extracting from file: " + file.to_s
|
144
|
+
|
145
|
+
# Extract publisher metadata form local file.
|
146
|
+
options = {uri: file, format: 'pretty_json', local: true}
|
147
|
+
|
148
|
+
case metadata_type
|
149
|
+
when 'author'
|
150
|
+
record = author(options)
|
151
|
+
when 'publisher'
|
152
|
+
record = publisher(options)
|
153
|
+
when 'book'
|
154
|
+
record = book(options)
|
155
|
+
when 'category'
|
156
|
+
record = category(options)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Prepare a path to save the new file.
|
160
|
+
filename = File.basename(file,".*")
|
161
|
+
path_to_save = "#{dir_to_save}#{filename}.json"
|
162
|
+
|
163
|
+
# Save to file.
|
164
|
+
save_to("#{path_to_save}", record)
|
165
|
+
|
166
|
+
end # unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
122
170
|
def parse_all_categories(will_save=false)
|
123
171
|
# list_directories('raw_ddc_pages').each do |dir|
|
124
172
|
# p dir
|
@@ -166,9 +214,8 @@ module Bookshark
|
|
166
214
|
private
|
167
215
|
|
168
216
|
def process_options(options = {}, caller = nil)
|
169
|
-
# puts
|
170
|
-
|
171
|
-
puts caller
|
217
|
+
# puts "Called from method: " + caller.to_s
|
218
|
+
|
172
219
|
id = options[:id]
|
173
220
|
|
174
221
|
if id
|
@@ -28,7 +28,7 @@ module Biblionet
|
|
28
28
|
|
29
29
|
headquarters = page.headquarters
|
30
30
|
bookstores = page.bookstores
|
31
|
-
bookstores['Έδρα'] = headquarters
|
31
|
+
bookstores['Έδρα'] = headquarters unless headquarters.all? {|k,v| v.nil? or v.empty?}
|
32
32
|
|
33
33
|
publisher_hash = {}
|
34
34
|
publisher_hash[:name] = page.name
|
@@ -97,7 +97,7 @@ module Biblionet
|
|
97
97
|
# Change keys. Use the same as in bookstores.
|
98
98
|
mappings = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
|
99
99
|
headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
|
100
|
-
headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array)
|
100
|
+
headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) or headquarters_hash[:telephone].nil?
|
101
101
|
headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',')
|
102
102
|
|
103
103
|
return headquarters_hash
|
data/lib/bookshark/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bookshark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.beta.
|
4
|
+
version: 1.0.0.beta.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitris Klisiaris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -76,16 +76,16 @@ dependencies:
|
|
76
76
|
name: bundler
|
77
77
|
requirement: !ruby/object:Gem::Requirement
|
78
78
|
requirements:
|
79
|
-
- - "
|
79
|
+
- - ">="
|
80
80
|
- !ruby/object:Gem::Version
|
81
|
-
version: '1.
|
81
|
+
version: '1.6'
|
82
82
|
type: :development
|
83
83
|
prerelease: false
|
84
84
|
version_requirements: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|
86
|
-
- - "
|
86
|
+
- - ">="
|
87
87
|
- !ruby/object:Gem::Version
|
88
|
-
version: '1.
|
88
|
+
version: '1.6'
|
89
89
|
- !ruby/object:Gem::Dependency
|
90
90
|
name: rake
|
91
91
|
requirement: !ruby/object:Gem::Requirement
|