art_paintings_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Binary file
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
require 'base64'
require 'logger'
require 'nokogiri'

require_relative 'art_paintings_extractor/version'
require_relative 'art_paintings_extractor/html_document'
8
+
9
# ArtPaintingsExtractor module
module ArtPaintingsExtractor
  # Extracts painting information (name, link and optional extensions) from
  # the `.klitem` elements of an HTML file.
  class ArtPaintings
    attr_reader :doc, :logger

    # @param html_file [String] path of the HTML file to parse
    def initialize(html_file)
      @logger = Logger.new($stdout)
      # HtmlDocument#call returns nil when the file cannot be read or parsed.
      @doc = HtmlDocument.new(html_file).call
    end

    # Returns the nodes matching the `.klitem` selector.
    #
    # When the document could not be loaded (`@doc` is nil) an empty list is
    # returned instead of raising NoMethodError, so callers can always iterate.
    def paintings
      return [] if @doc.nil?

      @doc.css('.klitem')
    end

    # Builds one info hash per painting node.
    #
    # @return [Array<Hash>] see #build_painting_info for the hash layout
    def extract_paintings
      paintings.map { |item| build_painting_info(item) }
    end

    # Builds the info hash for a single painting node.
    #
    # The hash always has the string keys 'name', 'link' and 'image'; the
    # 'extensions' key is only present when extension text was found.
    #
    # @param item [#[], #css] painting node
    # @return [Hash]
    def build_painting_info(item)
      info = {
        'name' => extract_name(item),
        'link' => extract_link(item),
        # Image extraction is currently disabled, so the value is always nil.
        'image' => nil
        # 'image' => extract_image(item)
      }

      extensions = extract_extensions(item)
      info['extensions'] = [extensions] unless extensions.empty?

      info
    end

    private

    # Reads the painting name from the `aria-label` attribute, falling back to
    # 'Unknown' when the attribute is missing or the item cannot be queried.
    def extract_name(item)
      item['aria-label'] || 'Unknown'
    rescue NoMethodError
      @logger.warn("Failed to extract name for item: #{item}")
      'Unknown'
    end

    # Returns the text of the `.ellip.klmeta` descendants (an empty string when
    # there are none).
    def extract_extensions(item)
      item.css('.ellip.klmeta').text
    rescue NoMethodError
      @logger.warn("Failed to extract extensions for item: #{item}")
      'No extension found for item'
    end

    # Prefixes the item's `href` with the Google host, or returns 'No link'
    # when the attribute is missing.
    def extract_link(item)
      href = item['href']
      href ? "https://www.google.com#{href}" : 'No link'
    rescue NoMethodError
      @logger.warn("Failed to extract link for item: #{item}")
      'No link'
    end

    # Returns the `src` of the first `g-img > img` descendant, or 'No image'.
    # Not called at the moment: see the disabled 'image' entry in
    # #build_painting_info.
    def extract_image(item)
      image = item.at_css('g-img > img')
      image_src = image ? image['src'] : 'No image'
      image_src || 'No image'
    rescue NoMethodError
      @logger.warn("Failed to extract image for item: #{item}")
      'No image'
    end
  end
end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
module ArtPaintingsExtractor
  # Reads an HTML file from disk and parses it with Nokogiri.
  class HtmlDocument
    attr_reader :logger

    # @param html [String] path of the HTML file to read
    def initialize(html)
      @html = html
      @logger = Logger.new($stdout)
    end

    # Reads and parses the file.
    #
    # @return [Object, nil] the document returned by `Nokogiri::HTML`, or nil
    #   when the file could not be read or parsed (the error is logged)
    def call
      extract_from_html
    end

    private

    # Returns the file content, or nil (after logging) on any read error.
    def read_html_file
      File.read(@html)
    rescue Errno::ENOENT
      @logger.error("File not found: #{@html}")
      nil
    rescue Errno::EACCES
      @logger.error("Permission denied: #{@html}")
      nil
    rescue StandardError => e
      @logger.error("Error reading file: #{e.message}")
      nil
    end

    # Parses the content read from disk.
    def extract_from_html
      html_source = read_html_file
      return nil unless html_source

      # Parse the content that was already read; reading the file a second
      # time doubles the I/O and could yield nil if the file vanished meanwhile.
      Nokogiri::HTML(html_source)
    rescue Nokogiri::XML::SyntaxError => e
      @logger.error("Error parsing HTML: #{e.message}")
      nil
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module ArtPaintingsExtractor
  # Version of the art_paintings_extractor gem.
  VERSION = '0.1.0'
end
data/run_extractor.rb ADDED
@@ -0,0 +1,19 @@
1
# run_extractor.rb
#
# Command-line entry point: parses an HTML file and prints the paintings found.
#
# Usage: ruby run_extractor.rb [path/to/file.html]

require_relative "lib/art_paintings_extractor"

# Path to the HTML file to parse: first command-line argument if given,
# otherwise the bundled sample. The default is resolved against this script's
# directory (like the require_relative above) so it works from any directory.
html_file = ARGV.fetch(0) { File.expand_path("files/van-gogh-paintings.html", __dir__) }

# Output JSON file
# output_file = 'files/extracted_paintings.json'

# Create an instance of the extractor
extractor = ArtPaintingsExtractor::ArtPaintings.new(html_file)

# Extract paintings and print them
paintings = extractor.extract_paintings
puts paintings

# Save paintings to a JSON file
# TODO: not implemented yet - ArtPaintings does not define save_to_file.
# extractor.save_to_file(output_file)
@@ -0,0 +1,4 @@
1
# Type signatures for the ArtPaintingsExtractor gem.
# The module name must match the Ruby module defined by the gem's lib files.
module ArtPaintingsExtractor
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: art_paintings_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jovan
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2024-07-27 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.15'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.15'
27
+ description: ArtPaintingsExtractor is a Ruby gem that parses an HTML file to extract
28
+ information about art paintings.
29
+ email:
30
+ - jovansr@pm.me
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - ".rspec"
36
+ - ".rubocop.yml"
37
+ - CHANGELOG.md
38
+ - CODE_OF_CONDUCT.md
39
+ - LICENSE.txt
40
+ - README.md
41
+ - Rakefile
42
+ - files/expected-array-without-images.json
43
+ - files/expected-array.json
44
+ - files/van-gogh-paintings.html
45
+ - files/van-gogh-paintings.json
46
+ - files/van-gogh-paintings.png
47
+ - lib/art_paintings.rb
48
+ - lib/art_paintings_extractor/html_document.rb
49
+ - lib/art_paintings_extractor/version.rb
50
+ - run_extractor.rb
51
+ - sig/van_gogh_paintings_extractor.rbs
52
+ homepage: https://github.com/jovan-sremacki/code-challenge
53
+ licenses:
54
+ - MIT
55
+ metadata:
56
+ homepage_uri: https://github.com/jovan-sremacki/code-challenge
57
+ source_code_uri: https://github.com/jovan-sremacki/code-challenge
58
+ post_install_message:
59
+ rdoc_options: []
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: 3.0.0
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubygems_version: 3.3.15
74
+ signing_key:
75
+ specification_version: 4
76
+ summary: A tool to extract Art paintings from HTML pages.
77
+ test_files: []