art_paintings_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
require 'base64'
require 'logger'
require 'nokogiri'

require_relative 'art_paintings_extractor/version'
require_relative 'art_paintings_extractor/html_document'
8
+
9
+ # ArtPaintingsExtractor module
10
module ArtPaintingsExtractor
  # Extracts painting information from an HTML file.
  #
  # The file is loaded through HtmlDocument, whose #call returns +nil+ when the
  # file cannot be read or parsed. In that case this class reports no paintings
  # instead of raising.
  class ArtPaintings
    # CSS selector matching one painting entry in the document.
    ITEM_SELECTOR = '.klitem'
    # CSS selector matching the extension text inside a painting entry.
    EXTENSIONS_SELECTOR = '.ellip.klmeta'
    # CSS selector matching the image element inside a painting entry.
    IMAGE_SELECTOR = 'g-img > img'
    # Prefix prepended to an entry's +href+ attribute to build its link.
    LINK_PREFIX = 'https://www.google.com'

    attr_reader :doc, :logger

    # @param html_file [String] path of the HTML file to extract paintings from
    def initialize(html_file)
      @logger = Logger.new($stdout)
      @doc = HtmlDocument.new(html_file).call
    end

    # Returns the elements that represent paintings.
    #
    # @return [Enumerable] the nodes matching ITEM_SELECTOR, or an empty array
    #   when the document could not be loaded
    def paintings
      # HtmlDocument has already logged why loading failed; there is simply
      # nothing to extract from a missing document.
      return [] unless @doc

      @doc.css(ITEM_SELECTOR)
    end

    # Builds one info hash per painting found in the document.
    #
    # @return [Array<Hash>] see #build_painting_info for the hash layout
    def extract_paintings
      paintings.map { |item| build_painting_info(item) }
    end

    # Builds the info hash for a single painting element.
    #
    # @param item [#[], #css] a painting element
    # @return [Hash] with the string keys 'name', 'link' and 'image', plus
    #   'extensions' (a one-element array) when extension text is present
    def build_painting_info(item)
      info = {
        'name' => extract_name(item),
        'link' => extract_link(item),
        # Image extraction is currently disabled; #extract_image is kept for
        # when it is re-enabled.
        'image' => nil
        # 'image' => extract_image(item)
      }

      extensions = extract_extensions(item)
      info['extensions'] = [extensions] unless extensions.empty?

      info
    end

    private

    # Reads the painting name from the 'aria-label' attribute.
    #
    # @return [String] the label, or 'Unknown' when it is missing
    def extract_name(item)
      item['aria-label'] || 'Unknown'
    rescue NoMethodError
      @logger.warn("Failed to extract name for item: #{item}")
      'Unknown'
    end

    # Reads the text of the nodes matching EXTENSIONS_SELECTOR.
    #
    # @return [String] the text (empty when nothing matches)
    def extract_extensions(item)
      item.css(EXTENSIONS_SELECTOR).text
    rescue NoMethodError
      @logger.warn("Failed to extract extensions for item: #{item}")
      'No extension found for item'
    end

    # Builds the link from the 'href' attribute.
    #
    # @return [String] LINK_PREFIX followed by the href, or 'No link'
    def extract_link(item)
      href = item['href']
      href ? "#{LINK_PREFIX}#{href}" : 'No link'
    rescue NoMethodError
      @logger.warn("Failed to extract link for item: #{item}")
      'No link'
    end

    # Reads the 'src' attribute of the first node matching IMAGE_SELECTOR.
    #
    # @return [String] the image source, or 'No image' when the node or its
    #   'src' attribute is missing
    def extract_image(item)
      image = item.at_css(IMAGE_SELECTOR)
      (image && image['src']) || 'No image'
    rescue NoMethodError
      @logger.warn("Failed to extract image for item: #{item}")
      'No image'
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module ArtPaintingsExtractor
  # Reads an HTML file from disk and parses it with Nokogiri.
  #
  # Failures are logged instead of raised: #call returns +nil+ when the file
  # cannot be read or when parsing raises Nokogiri::XML::SyntaxError.
  class HtmlDocument
    attr_reader :logger

    # @param html [String] path of the HTML file to load
    def initialize(html)
      @html = html
      @logger = Logger.new($stdout)
    end

    # Loads and parses the file.
    #
    # @return [Object, nil] the result of Nokogiri::HTML, or nil on failure
    def call
      extract_from_html
    end

    private

    # Reads the raw file content.
    #
    # @return [String, nil] the content, or nil (after logging) on any error
    def read_html_file
      File.read(@html)
    rescue Errno::ENOENT
      @logger.error("File not found: #{@html}")
      nil
    rescue Errno::EACCES
      @logger.error("Permission denied: #{@html}")
      nil
    rescue StandardError => e
      @logger.error("Error reading file: #{e.message}")
      nil
    end

    # Parses the file content into a document.
    #
    # @return [Object, nil] the parsed document, or nil when reading or
    #   parsing failed
    def extract_from_html
      html_file = read_html_file
      return nil unless html_file

      # Parse the content that was just read; reading the file a second time
      # would double the I/O and could hand nil to the parser if it failed.
      Nokogiri::HTML(html_file)
    rescue Nokogiri::XML::SyntaxError => e
      @logger.error("Error parsing HTML: #{e.message}")
      nil
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ArtPaintingsExtractor
  # Version string of the art_paintings_extractor gem.
  VERSION = "0.1.0"
end
data/run_extractor.rb ADDED
@@ -0,0 +1,19 @@
1
# frozen_string_literal: true

# run_extractor.rb
#
# Example script: extracts the paintings from the bundled sample page and
# prints them to standard output.

# The extractor's entry point ships as lib/art_paintings.rb.
require_relative 'lib/art_paintings'

# Path to the HTML file you want to parse, resolved relative to this script so
# that it can be run from any working directory.
html_file = File.join(__dir__, 'files', 'van-gogh-paintings.html')

# Output JSON file
# output_file = 'files/extracted_paintings.json'

# Create an instance of the extractor
extractor = ArtPaintingsExtractor::ArtPaintings.new(html_file)

# Extract paintings and print them
paintings = extractor.extract_paintings
puts paintings

# Save paintings to a JSON file
# NOTE: ArtPaintings does not define save_to_file, so this stays disabled.
# extractor.save_to_file(output_file)
@@ -0,0 +1,4 @@
1
# Type signatures for the art_paintings_extractor gem.
module ArtPaintingsExtractor
  # Version string of the gem.
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: art_paintings_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jovan
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-07-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.15'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.15'
27
+ description: ArtPaintingsExtractor is a Ruby gem that parses an HTML file to extract
28
+ information about art paintings.
29
+ email:
30
+ - jovansr@pm.me
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - ".rspec"
36
+ - ".rubocop.yml"
37
+ - CHANGELOG.md
38
+ - CODE_OF_CONDUCT.md
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - files/expected-array-without-images.json
43
+ - files/expected-array.json
44
+ - files/van-gogh-paintings.html
45
+ - files/van-gogh-paintings.json
46
+ - files/van-gogh-paintings.png
47
+ - lib/art_paintings.rb
48
+ - lib/art_paintings_extractor/html_document.rb
49
+ - lib/art_paintings_extractor/version.rb
50
+ - run_extractor.rb
51
+ - sig/van_gogh_paintings_extractor.rbs
52
+ homepage: https://github.com/jovan-sremacki/code-challenge
53
+ licenses:
54
+ - MIT
55
+ metadata:
56
+ homepage_uri: https://github.com/jovan-sremacki/code-challenge
57
+ source_code_uri: https://github.com/jovan-sremacki/code-challenge
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 3.0.0
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 3.3.15
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: A tool to extract Art paintings from HTML pages.
77
+ test_files: []