art_paintings_extractor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +1 -0
- data/.rubocop.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/files/expected-array-without-images.json +303 -0
- data/files/expected-array.json +391 -0
- data/files/van-gogh-paintings.html +487 -0
- data/files/van-gogh-paintings.json +682 -0
- data/files/van-gogh-paintings.png +0 -0
- data/lib/art_paintings.rb +75 -0
- data/lib/art_paintings_extractor/html_document.rb +43 -0
- data/lib/art_paintings_extractor/version.rb +5 -0
- data/run_extractor.rb +19 -0
- data/sig/van_gogh_paintings_extractor.rbs +4 -0
- metadata +77 -0
Binary file
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'base64'
require 'logger'
require 'nokogiri'

require_relative 'art_paintings_extractor/version'
require_relative 'art_paintings_extractor/html_document'
|
8
|
+
|
9
|
+
# ArtPaintingsExtractor module
|
10
|
+
module ArtPaintingsExtractor
  # Extracts painting information from an HTML file.
  #
  # The file is read and parsed once, in the constructor, through
  # HtmlDocument. Every element matching ITEM_SELECTOR is then turned into a
  # plain Hash with the string keys 'name', 'link', 'image' and, when
  # present, 'extensions'.
  class ArtPaintings
    # CSS selector of one painting entry in the parsed document.
    ITEM_SELECTOR = '.klitem'
    # CSS selector of the metadata text inside a painting entry.
    EXTENSIONS_SELECTOR = '.ellip.klmeta'
    # Prefix prepended to the (relative) href of every entry.
    LINK_PREFIX = 'https://www.google.com'

    attr_reader :doc, :logger

    # @param html_file [String] path of the HTML file to parse
    # @param logger [Logger] receives a warning whenever a field cannot be
    #   extracted; defaults to a logger writing to $stdout
    def initialize(html_file, logger: Logger.new($stdout))
      @logger = logger
      # HtmlDocument#call returns nil when the file cannot be read or parsed.
      @doc = HtmlDocument.new(html_file).call
    end

    # All painting elements found in the document.
    #
    # @return [Enumerable] the elements matching ITEM_SELECTOR, or an empty
    #   Array when no document could be loaded
    def paintings
      # Without this guard a missing/unreadable input file surfaced later as
      # a NoMethodError on nil instead of an empty result.
      return [] unless @doc

      @doc.css(ITEM_SELECTOR)
    end

    # @return [Array<Hash>] one info Hash per painting (see #build_painting_info)
    def extract_paintings
      paintings.map { |item| build_painting_info(item) }
    end

    # Builds the info Hash of a single painting element.
    #
    # @param item [#[], #css] element exposing attribute access and CSS lookup
    # @return [Hash] 'name', 'link' and 'image' are always present;
    #   'extensions' (a one-element Array) only when metadata text was found
    def build_painting_info(item)
      info = {
        'name' => extract_name(item),
        'link' => extract_link(item),
        'image' => nil
        # 'image' => extract_image(item)
      }

      extensions = extract_extensions(item)
      info['extensions'] = [extensions] unless extensions.empty?

      info
    end

    private

    # @return [String] the 'aria-label' attribute, or 'Unknown' when it is
    #   missing or the item does not support attribute access
    def extract_name(item)
      item['aria-label'] || 'Unknown'
    rescue NoMethodError
      @logger.warn("Failed to extract name for item: #{item}")
      'Unknown'
    end

    # @return [String] text of the metadata node(s); empty when there is none
    #   or the item does not support CSS lookup
    def extract_extensions(item)
      item.css(EXTENSIONS_SELECTOR).text
    rescue NoMethodError
      @logger.warn("Failed to extract extensions for item: #{item}")
      # Must be empty: any non-empty fallback would be published by
      # #build_painting_info as if it were a real extension.
      ''
    end

    # @return [String] absolute link built from the 'href' attribute, or
    #   'No link' when it is missing or the item does not support attribute access
    def extract_link(item)
      href = item['href']
      href ? "#{LINK_PREFIX}#{href}" : 'No link'
    rescue NoMethodError
      @logger.warn("Failed to extract link for item: #{item}")
      'No link'
    end

    # Currently unused: #build_painting_info always emits a nil 'image'.
    #
    # @return [String] the 'src' of the first 'g-img > img' element, or
    #   'No image' when there is no such element or it has no 'src'
    def extract_image(item)
      image = item.at_css('g-img > img')
      image_src = image ? image['src'] : 'No image'
      image_src || 'No image'
    rescue NoMethodError
      @logger.warn("Failed to extract image for item: #{item}")
      'No image'
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ArtPaintingsExtractor
  # Reads an HTML file from disk and parses it with Nokogiri.
  #
  # Read and parse failures are logged and reported to the caller as nil
  # instead of being raised.
  class HtmlDocument
    attr_reader :logger

    # @param html [String] path of the HTML file to read
    # @param logger [Logger] receives an error message for every failure;
    #   defaults to a logger writing to $stdout
    def initialize(html, logger: Logger.new($stdout))
      @html = html
      @logger = logger
    end

    # Reads and parses the file given to the constructor.
    #
    # @return [Object, nil] the value returned by Nokogiri::HTML, or nil when
    #   the file could not be read or parsed
    def call
      extract_from_html
    end

    private

    # @return [String, nil] the file content, or nil (after logging) when the
    #   file is missing, not readable, or any other StandardError occurs
    def read_html_file
      File.read(@html)
    rescue Errno::ENOENT
      @logger.error("File not found: #{@html}")
      nil
    rescue Errno::EACCES
      @logger.error("Permission denied: #{@html}")
      nil
    rescue StandardError => e
      @logger.error("Error reading file: #{e.message}")
      nil
    end

    # @return [Object, nil] the parsed document, or nil on read/parse failure
    def extract_from_html
      content = read_html_file
      return nil unless content

      # Parse the content that was just read. Reading the file a second time
      # here doubled the I/O and, if that second read failed, handed nil to
      # the parser.
      Nokogiri::HTML(content)
    rescue Nokogiri::XML::SyntaxError => e
      @logger.error("Error parsing HTML: #{e.message}")
      nil
    end
  end
end
|
data/run_extractor.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# run_extractor.rb
#
# Example script: parses the bundled sample page and prints every extracted
# painting to standard output.

# The library entry point shipped with the gem is lib/art_paintings.rb.
require_relative "lib/art_paintings"

# Path to the HTML file you want to parse, resolved against this script's
# directory so the script works from any working directory.
html_file = File.expand_path("files/van-gogh-paintings.html", __dir__)

# Output JSON file
# output_file = 'files/extracted_paintings.json'

# Create an instance of the extractor
extractor = ArtPaintingsExtractor::ArtPaintings.new(html_file)

# Extract paintings and print them
paintings = extractor.extract_paintings
puts paintings

# Save paintings to a JSON file
# NOTE(review): ArtPaintings defines no save_to_file method in the library
# file shown alongside this script; implement it before enabling this line.
# extractor.save_to_file(output_file)
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: art_paintings_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jovan
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-07-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
description: ArtPaintingsExtractor is a Ruby gem that parses an HTML file to extract
|
28
|
+
information about art paintings.
|
29
|
+
email:
|
30
|
+
- jovansr@pm.me
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- ".rspec"
|
36
|
+
- ".rubocop.yml"
|
37
|
+
- CHANGELOG.md
|
38
|
+
- CODE_OF_CONDUCT.md
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- files/expected-array-without-images.json
|
43
|
+
- files/expected-array.json
|
44
|
+
- files/van-gogh-paintings.html
|
45
|
+
- files/van-gogh-paintings.json
|
46
|
+
- files/van-gogh-paintings.png
|
47
|
+
- lib/art_paintings.rb
|
48
|
+
- lib/art_paintings_extractor/html_document.rb
|
49
|
+
- lib/art_paintings_extractor/version.rb
|
50
|
+
- run_extractor.rb
|
51
|
+
- sig/van_gogh_paintings_extractor.rbs
|
52
|
+
homepage: https://github.com/jovan-sremacki/code-challenge
|
53
|
+
licenses:
|
54
|
+
- MIT
|
55
|
+
metadata:
|
56
|
+
homepage_uri: https://github.com/jovan-sremacki/code-challenge
|
57
|
+
source_code_uri: https://github.com/jovan-sremacki/code-challenge
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 3.0.0
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubygems_version: 3.3.15
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: A tool to extract Art paintings from HTML pages.
|
77
|
+
test_files: []
|