art_paintings_extractor 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +1 -0
- data/.rubocop.yml +19 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +8 -0
- data/files/expected-array-without-images.json +303 -0
- data/files/expected-array.json +391 -0
- data/files/van-gogh-paintings.html +487 -0
- data/files/van-gogh-paintings.json +682 -0
- data/files/van-gogh-paintings.png +0 -0
- data/lib/art_paintings.rb +75 -0
- data/lib/art_paintings_extractor/html_document.rb +43 -0
- data/lib/art_paintings_extractor/version.rb +5 -0
- data/run_extractor.rb +19 -0
- data/sig/van_gogh_paintings_extractor.rbs +4 -0
- metadata +77 -0
Binary file
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'base64'
require 'logger'
require 'nokogiri'

require_relative 'art_paintings_extractor/version'
require_relative 'art_paintings_extractor/html_document'
|
8
|
+
|
9
|
+
# ArtPaintingsExtractor module
|
10
|
+
module ArtPaintingsExtractor
  # Extracts painting information from an HTML file.
  #
  # Each element matching the `.klitem` selector is turned into a Hash with
  # the string keys 'name', 'link', 'image' and, when present, 'extensions'.
  class ArtPaintings
    attr_reader :doc, :logger

    # @param html_file [String] path handed to HtmlDocument for reading and parsing
    def initialize(html_file)
      @logger = Logger.new($stdout)
      @doc = HtmlDocument.new(html_file).call
    end

    # Returns the elements matching `.klitem`.
    #
    # @return [Enumerable] the matched elements, or an empty Array when no
    #   document was loaded (HtmlDocument#call returns nil on read failure)
    def paintings
      # Without this guard a missing/unreadable file surfaces later as a
      # NoMethodError on nil, even though the failure was already logged.
      return [] unless @doc

      @doc.css('.klitem')
    end

    # @return [Array<Hash>] one info Hash per element returned by #paintings
    def extract_paintings
      paintings.map { |item| build_painting_info(item) }
    end

    # Builds the info Hash for a single element.
    #
    # @param item [#[], #css] element exposing attribute lookup and CSS search
    # @return [Hash{String => Object}] 'name', 'link', 'image' (always nil) and,
    #   only when the extracted text is non-empty, 'extensions' (one-element Array)
    def build_painting_info(item)
      info = {
        'name' => extract_name(item),
        'link' => extract_link(item),
        # Image extraction is intentionally disabled; see #extract_image.
        'image' => nil
        # 'image' => extract_image(item)
      }

      extensions = extract_extensions(item)
      info['extensions'] = [extensions] unless extensions.empty?

      info
    end

    private

    # @return [String] the element's `aria-label` attribute, or 'Unknown'
    def extract_name(item)
      item['aria-label'] || 'Unknown'
    rescue NoMethodError
      @logger.warn("Failed to extract name for item: #{item}")
      'Unknown'
    end

    # @return [String] text of the `.ellip.klmeta` descendants (may be empty)
    def extract_extensions(item)
      item.css('.ellip.klmeta').text
    rescue NoMethodError
      @logger.warn("Failed to extract extensions for item: #{item}")
      # NOTE(review): this fallback is non-empty, so it ends up in the
      # 'extensions' list of the result — confirm that is intended.
      'No extension found for item'
    end

    # @return [String] the `href` attribute prefixed with the Google host,
    #   or 'No link' when the attribute is absent
    def extract_link(item)
      href = item['href']
      href ? "https://www.google.com#{href}" : 'No link'
    rescue NoMethodError
      @logger.warn("Failed to extract link for item: #{item}")
      'No link'
    end

    # Currently unused: #build_painting_info always stores nil for 'image'.
    #
    # @return [String] `src` of the first `g-img > img` descendant, or 'No image'
    def extract_image(item)
      image = item.at_css('g-img > img')
      image_src = image ? image['src'] : 'No image'
      image_src || 'No image'
    rescue NoMethodError
      @logger.warn("Failed to extract image for item: #{item}")
      'No image'
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module ArtPaintingsExtractor
  # Reads the contents of an HTML file and parses them with Nokogiri.
  #
  # Failures are logged to $stdout and reported to the caller as nil rather
  # than raised.
  class HtmlDocument
    attr_reader :logger

    # @param html [String] path of the HTML file to read
    def initialize(html)
      @html = html
      @logger = Logger.new($stdout)
    end

    # @return [Object, nil] the result of Nokogiri::HTML for the file contents,
    #   or nil when the file could not be read or parsed
    def call
      extract_from_html
    end

    private

    # Reads the file at @html.
    #
    # @return [String, nil] file contents, or nil after logging the failure
    def read_html_file
      File.read(@html)
    rescue Errno::ENOENT
      @logger.error("File not found: #{@html}")
      nil
    rescue Errno::EACCES
      @logger.error("Permission denied: #{@html}")
      nil
    rescue StandardError => e
      @logger.error("Error reading file: #{e.message}")
      nil
    end

    # Parses the file contents.
    #
    # @return [Object, nil] parsed document, or nil on read/parse failure
    def extract_from_html
      html_file = read_html_file

      return nil unless html_file

      # Parse the content that was already read; reading the file a second
      # time doubles the I/O and could slip a nil past the guard above.
      Nokogiri::HTML(html_file)
    rescue Nokogiri::XML::SyntaxError => e
      @logger.error("Error parsing HTML: #{e.message}")
      nil
    end
  end
end
|
data/run_extractor.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# run_extractor.rb
#
# Example script: parses the bundled Van Gogh HTML page and prints the
# extracted paintings to standard output.

# The gem's entry point is lib/art_paintings.rb; no
# lib/art_paintings_extractor.rb is shipped, so requiring that name fails.
require_relative 'lib/art_paintings'

# Path to the HTML file you want to parse, resolved against this script's
# directory so the script works regardless of the current working directory.
html_file = File.expand_path('files/van-gogh-paintings.html', __dir__)

# Output JSON file
# output_file = 'files/extracted_paintings.json'

# Create an instance of the extractor
extractor = ArtPaintingsExtractor::ArtPaintings.new(html_file)

# Extract paintings and print them
paintings = extractor.extract_paintings
puts paintings

# Save paintings to a JSON file
# NOTE(review): ArtPaintings defines no save_to_file method in the visible
# sources, so this stays disabled until one exists.
# extractor.save_to_file(output_file)
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: art_paintings_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jovan
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-07-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
description: ArtPaintingsExtractor is a Ruby gem that parses an HTML file to extract
|
28
|
+
information about art paintings.
|
29
|
+
email:
|
30
|
+
- jovansr@pm.me
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- ".rspec"
|
36
|
+
- ".rubocop.yml"
|
37
|
+
- CHANGELOG.md
|
38
|
+
- CODE_OF_CONDUCT.md
|
39
|
+
- LICENSE.txt
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- files/expected-array-without-images.json
|
43
|
+
- files/expected-array.json
|
44
|
+
- files/van-gogh-paintings.html
|
45
|
+
- files/van-gogh-paintings.json
|
46
|
+
- files/van-gogh-paintings.png
|
47
|
+
- lib/art_paintings.rb
|
48
|
+
- lib/art_paintings_extractor/html_document.rb
|
49
|
+
- lib/art_paintings_extractor/version.rb
|
50
|
+
- run_extractor.rb
|
51
|
+
- sig/van_gogh_paintings_extractor.rbs
|
52
|
+
homepage: https://github.com/jovan-sremacki/code-challenge
|
53
|
+
licenses:
|
54
|
+
- MIT
|
55
|
+
metadata:
|
56
|
+
homepage_uri: https://github.com/jovan-sremacki/code-challenge
|
57
|
+
source_code_uri: https://github.com/jovan-sremacki/code-challenge
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 3.0.0
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubygems_version: 3.3.15
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: A tool to extract Art paintings from HTML pages.
|
77
|
+
test_files: []
|