RubyGems - hiroiyomi - Versions diffs - 0.1.1 → 0.1.2 - Mend

hiroiyomi 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/.ruby-version +1 -0
data/README.md +1 -3
data/lib/hiroiyomi/html/attribute.rb +48 -2
data/lib/hiroiyomi/html/childable.rb +12 -0
data/lib/hiroiyomi/html/document.rb +14 -6
data/lib/hiroiyomi/html/dom_parser.rb +38 -0
data/lib/hiroiyomi/html/dom_parser_helper.rb +70 -0
data/lib/hiroiyomi/html/element.rb +164 -6
data/lib/hiroiyomi/html/text.rb +88 -0
data/lib/hiroiyomi/parser.rb +4 -12
data/lib/hiroiyomi/version.rb +1 -1
data/lib/hiroiyomi.rb +4 -3
metadata +7 -3
data/lib/hiroiyomi/html_parser.rb +0 -191

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 16f588a0df9ea7f70429f272c27e70b3de34378959999be3c09b0a9d535c8040
-  data.tar.gz: f5a2922e84a4b42445437a81e3e3eebf6117c044d7a25666ceeeed29c0c2350e
+  metadata.gz: 0661022f19059a23cf6ec690cb47bb531f2b08225197e487648d91a8f31df1d7
+  data.tar.gz: 813da1519fe1b3da7e41775d5ca6b63198d5846e8f1624f95d9b82df0f46b35c
 SHA512:
-  metadata.gz: cd0b643422d1b5b0807fd872cb5c2db60e7ec20ef491c9d6c69921b10871d9f49e762bb06f249f1d65adbb2be8229fbfa184aa22ecda77f59324354aab72e1fd
-  data.tar.gz: b4debfcfd5dbcc3519c2d85544b97d4b984d00c59eb50c5a4f54e2e1d683605c468e6c403c57bf4caf2d92622f75e8c8593d2c40d59d6302acbb9a1eca99e8b3
+  metadata.gz: 0bd9bc0554a39fc7ed25133be2cda01d2393268ef49ca1dee9c9dbc67cfa1354bb940a9e651c8236006df0c17f277eb0f821dfe4dbf3584ca6756044a65070c1
+  data.tar.gz: bc3f1e4e3ab39af9f83367a9d72fa622a801978fb840c426464cf7704ec16d9ad81f594efb360ac76c60fc0107801fbd9ce280edcdc242042404d61a11c62175

data/.ruby-version ADDED Viewed

	@@ -0,0 +1 @@
1	+ 2.5.1

data/README.md CHANGED Viewed

@@ -1,8 +1,6 @@
 # Hiroiyomi
-Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/hiroiyomi`. To experiment with that code, run `bin/console` for an interactive prompt.
-TODO: Delete this and the text above, and describe your gem
+Provides features to parse and filter HTML elements.
 ## Installation

data/lib/hiroiyomi/html/attribute.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 # frozen_string_literal: true
-require 'open-uri'
-require 'openssl'
+require 'hiroiyomi/html/dom_parser_helper'
 module Hiroiyomi
   module Html
@@ -9,6 +8,53 @@ module Hiroiyomi
     class Attribute
       attr_accessor :name, :value
+      class << self
+        def value_of(file)
+          name = DOMParserHelper.extract_string(file)
+          return nil if name.empty?
+          value = extract_value(file)
+          Attribute.new(name, value.empty? ? nil : value)
+        end
+        private
+        # name=value
+        #   Check spaces and > whether value is end
+        # name="value"
+        # name='value'
+        def extract_value(file)
+          value = ''
+          open  = { "'" => false, '"' => false }
+          equal = false
+          while (c = file.getc)
+            case c
+            when "'", '"'
+              break if open[c]
+              open_keys = open.keys
+              open_keys.delete(c)
+              if open[open_keys.first]
+                value += c
+              else
+                open[c] = true
+              end
+            else
+              if open.values.any?
+                value += c
+              elsif c == '='
+                equal = true
+              elsif ['>', ' '].include?(c)
+                file.ungetc(c)
+                break
+              elsif equal
+                value += c
+              end
+            end
+          end
+          value
+        end
+      end
       def initialize(name, value = nil)
         @name  = name
         @value = value

data/lib/hiroiyomi/html/childable.rb ADDED Viewed

@@ -0,0 +1,12 @@
+# frozen_string_literal: true
+module Hiroiyomi
+  module Html
+    # Childable
+    module Childable
+      def text?
+        false
+      end
+    end
+  end
+end

data/lib/hiroiyomi/html/document.rb CHANGED Viewed

@@ -1,7 +1,9 @@
 # frozen_string_literal: true
-require 'open-uri'
-require 'openssl'
+require 'hiroiyomi/html/element'
+require 'hiroiyomi/html/attribute'
+require 'hiroiyomi/html/text'
+require 'hiroiyomi/html/dom_parser_helper'
 module Hiroiyomi
   module Html
@@ -11,12 +13,18 @@ module Hiroiyomi
       attr_accessor :root
-      def initialize
-        @root = nil
+      class << self
+        def value_of(file)
+          document = new
+          return document if file.nil?
+          document.root = Element.value_of(file)
+          document
+        end
       end
-      def element=(element)
-        @root = element
+      def initialize
+        @root = nil
       end
       def each

data/lib/hiroiyomi/html/dom_parser.rb ADDED Viewed

@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+require 'hiroiyomi/parser'
+require 'hiroiyomi/html/document'
+module Hiroiyomi
+  module Html
+    # DOMParser
+    class DOMParser
+      include Parser
+      private
+      def do_parse(file)
+        Document.value_of(file)
+      end
+      def do_filter(document, filter:, is_deep: true)
+        filtered_elements = filter_element(document, filter, [])
+        return filtered_elements unless is_deep
+        filtered_elements.map { |e| e.deep_select(filter) }.flatten
+      end
+      def filter_element(element, filter, filtered_elements)
+        element.each do |child|
+          next if child.text?
+          if filter&.include?(child.name.downcase)
+            filtered_elements.push(child)
+          else
+            filter_element(child, filter, filtered_elements)
+          end
+        end
+        filtered_elements
+      end
+    end
+  end
+end

data/lib/hiroiyomi/html/dom_parser_helper.rb ADDED Viewed

@@ -0,0 +1,70 @@
+# frozen_string_literal: true
+module Hiroiyomi
+  module Html
+    # DOMParserHelper
+    class DOMParserHelper
+      class << self
+        def cur_pos(file, c)
+          file.ungetc(c) # In order to get current position correctly
+          cur_pos = file.pos
+          file.getc # drop <
+          cur_pos
+        end
+        def skip_ignore_chars(file)
+          while (c = file.getc)
+            unless /[\\t\\n\\r\s]/.match?(c)
+              file.ungetc(c)
+              return
+            end
+          end
+        end
+        # string of <.+ or ".+"
+        def extract_string(file)
+          skip_ignore_chars(file)
+          string = ''
+          while (c = file.getc)
+            case c
+            when /[\w-]/
+              string += c
+            else
+              file.ungetc(c)
+              break
+            end
+          end
+          string.gsub(/[\t\r\n]/, '').strip
+        end
+        def extract_text_with_symbols(file, char_before_last_char = ']', last_char = '>')
+          string = ''
+          while (c = file.getc)
+            string += c
+            next_c = file.getc
+            if c == char_before_last_char && last_char == next_c
+              string += next_c
+              break
+            end
+            file.ungetc(next_c)
+          end
+          string
+        end
+        # after <!
+        def extract_bang_text(file)
+          cur_pos = file.pos
+          case (c = file.getc)
+          when '[' # CDDATA
+            return "#{c}#{extract_text_with_symbols(file, ']')}"
+          when '-' # Comment
+            extract_text_with_symbols(file, '-')
+            return '' # Drop comments
+          end
+          file.pos = cur_pos
+          nil
+        end
+      end
+    end
+  end
+end

data/lib/hiroiyomi/html/element.rb CHANGED Viewed

@@ -1,25 +1,145 @@
 # frozen_string_literal: true
-require 'open-uri'
-require 'openssl'
+require 'hiroiyomi/html/childable'
+require 'hiroiyomi/html/attribute'
+require 'hiroiyomi/html/text'
 module Hiroiyomi
   module Html
     # Element
     class Element
       include Enumerable
+      include Childable
-      attr_accessor :name, :content, :attributes, :children
+      attr_accessor :name, :parent, :attributes, :children
-      def initialize(name, content: nil, attributes: [], children: [])
+      class << self
+        EXCEPTIONAL_ELEMENT_NAME_LIST = %w[script style].freeze
+        def value_of(file, parent_element = nil)
+          # name
+          name = extract_element_name(file)
+          return parent_element if name.empty?
+          # element
+          element = Element.new(name, parent: parent_element)
+          if parent_element.nil?
+            parent_element = element
+          else
+            parent_element.element = element
+          end
+          # attributes
+          element.attributes = extract_attributes(file)
+          # exceptional elements
+          if EXCEPTIONAL_ELEMENT_NAME_LIST.include?(name.downcase)
+            element.element = extract_exceptional_element_text(file, name)
+            return parent_element
+          end
+          # text if >..., close if /, or open element if >...<
+          Text.add_text_to_element_or_parse(file, element)
+          # close check. move element children to parent element if not closed. e.g. <img ...>
+          element.move_children_to(parent_element) unless validate_closing_element?(element, file)
+          parent_element
+        end
+        private
+        def validate_closing_element?(element, file)
+          open = false
+          while (c = file.getc)
+            # /> or </
+            if c == '/'
+              open = false
+              cur_pos = DOMParserHelper.cur_pos(file, c)
+              next_c = file.getc
+              return true if next_c == '>' # case of />
+              # Check whether name is the same or not
+              file.ungetc(next_c)
+              close_name = DOMParserHelper.extract_string(file)
+              return false if close_name.empty?
+              is_closed = close_name == element.name
+              return true if is_closed
+              # Try it again if name is not matched and next close element name does not exist in parent elements
+              next unless element.parents?(close_name)
+              file.pos = cur_pos
+              return false
+            elsif c == '<' # case of </
+              open = true
+            elsif open
+              file.ungetc(c)
+              return false
+            end
+          end
+          false
+        end
+        # Start from > after attributes
+        def extract_exceptional_element_text(file, name)
+          DOMParserHelper.skip_ignore_chars(file)
+          file.getc # drop >
+          string = ''
+          while (c = file.getc)
+            if c == '<'
+              cur_pos = file.pos
+              if file.getc == '/' && name == DOMParserHelper.extract_string(file)
+                DOMParserHelper.skip_ignore_chars(file)
+                file.getc # drop >
+                break
+              end
+              file.pos = cur_pos
+            end
+            string += c
+          end
+          return Text.new(string) unless string.empty?
+          nil
+        end
+        def extract_element_name(file)
+          while (c = file.getc)
+            next unless c == '<'
+            cur_pos = file.pos
+            if file.getc == '!'
+              # Skip like <!document html>, <!--
+              DOMParserHelper.extract_bang_text(file)
+              next
+            end
+            file.pos = cur_pos
+            return DOMParserHelper.extract_string(file)
+          end
+          ''
+        end
+        def extract_attributes(file)
+          attributes = []
+          while (attribute = Attribute.value_of(file))
+            attributes.push(attribute)
+          end
+          attributes
+        end
+      end
+      def initialize(name, parent: nil, attributes: [], children: [])
         @name       = name
-        @content    = content
+        @parent     = parent
         @attributes = attributes
         @children   = children
       end
       def element=(element)
-        @children.push(element)
+        @children.push(element) unless element.nil?
       end
       def each
@@ -27,6 +147,44 @@ module Hiroiyomi
           yield child
         end
       end
+      def move_children_to(element)
+        each do |child|
+          element.element = child
+        end
+        children.clear
+      end
+      def parents?(name)
+        return false if parent.nil?
+        return true if parent.name == name
+        parent.parents?(name)
+      end
+      def deep_select(search_name_list = [], searched = [])
+        searched.push(self) if search_name_list.include?(name.downcase)
+        children.each do |child|
+          next if child.text?
+          if search_name_list.include?(child.name.downcase)
+            searched.push(child)
+          else
+            child.deep_select(search_name_list, searched)
+          end
+        end
+        searched
+      end
+      def to_s
+        attrs = attributes.map(&:to_s).join(' ')
+        attrs = ' ' + attrs unless attrs.empty?
+        "<#{name}#{attrs}>#{innerHTML}</#{name}>"
+      end
+      private
+      def innerHTML
+        children.map(&:to_s).join
+      end
     end
   end
 end

data/lib/hiroiyomi/html/text.rb ADDED Viewed

@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+require 'hiroiyomi/html/childable'
+require 'hiroiyomi/html/element'
+module Hiroiyomi
+  module Html
+    # Text
+    class Text
+      include Childable
+      attr_accessor :value
+      class << self
+        # Start from > after attributes
+        def add_text_to_element_or_parse(file, element)
+          close = false
+          string = ''
+          append_string = lambda { |str|
+            string += str if close
+          }
+          add_text_to_element = lambda { |str = ''|
+            append_string.call str
+            string = string.gsub(/[\t\r\n]/, '').strip
+            unless string.empty?
+              element.element = new(string)
+              string = ''
+            end
+          }
+          while (c = file.getc)
+            case c
+            when '/' # /* */ ?
+              cur_pos = DOMParserHelper.cur_pos(file, c)
+              next_c = file.getc
+              if next_c == '*'
+                add_text_to_element.call "#{c}#{next_c}#{DOMParserHelper.extract_text_with_symbols(file, next_c, c)}"
+                next
+              end
+              # / is of />
+              file.pos = cur_pos
+              break
+            when '<'
+              cur_pos = DOMParserHelper.cur_pos(file, c)
+              next_c = file.getc
+              if next_c == '!'
+                bang_string = DOMParserHelper.extract_bang_text(file)
+                unless bang_string.nil?
+                  # empty if comment
+                  add_text_to_element.call "#{c}#{next_c}#{bang_string}" unless bang_string.empty?
+                  next
+                end
+              end
+              file.pos = cur_pos
+              add_text_to_element.call
+              # Next element from < char
+              element = Element.value_of(file, element)
+              # file.getc # drop <
+            when '>' # > is of >...
+              close = true
+            else
+              append_string.call c
+            end
+          end
+          add_text_to_element.call
+          element
+        end
+      end
+      def initialize(value)
+        @value = value
+      end
+      def text?
+        true
+      end
+      def to_s
+        value
+      end
+    end
+  end
+end

data/lib/hiroiyomi/parser.rb CHANGED Viewed

@@ -7,22 +7,14 @@ module Hiroiyomi
   # Parser
   module Parser
     def self.included(klass)
-      # @param [String] url URL
-      # @param [Array] filter of filtered by name list, e.g. [h1, h2, h3]
-      #
-      # @return [Array] of Hiroiyomi::Html::Element which has been filtered
-      def klass.read(url, filter:)
-        new.read(url, filter: filter)
+      def klass.read(url, filter:, is_deep: true)
+        new.read(url, filter: filter, is_deep: is_deep)
       end
     end
-    # @param [String] url URL
-    # @param [Array] filter of filtered by name list, e.g. [h1, h2, h3]
-    #
-    # @return [Array] of Hiroiyomi::Html::Element which has been filtered
-    def read(url, filter:)
+    def read(url, filter:, is_deep: true)
       @open_file = open_url(url)
-      do_filter(do_parse(@open_file), filter: filter)
+      do_filter(do_parse(@open_file), filter: filter, is_deep: is_deep)
     ensure
       @open_file&.unlink
     end

data/lib/hiroiyomi/version.rb CHANGED Viewed

@@ -2,5 +2,5 @@
 # Hiroiyomi
 module Hiroiyomi
-  VERSION = '0.1.1'
+  VERSION = '0.1.2'
 end

data/lib/hiroiyomi.rb CHANGED Viewed

@@ -2,16 +2,17 @@
 require 'hiroiyomi/version'
 require 'hiroiyomi/root'
-require 'hiroiyomi/html_parser'
+require 'hiroiyomi/html/dom_parser'
 # Hiroiyomi
 module Hiroiyomi
   # @param [String] url URL
   # @param [Array] filter of filtered by name list, e.g. [h1, h2, h3]
+  # @param [Boolean] is_deep Whether result is filtered into children
   #
   # @return [Array] of Hiroiyomi::Html::Element which has been filtered
-  def read(url, filter: [])
-    HtmlParser.read(url, filter: filter)
+  def read(url, filter: [], is_deep: true)
+    Html::DOMParser.read(url, filter: filter, is_deep: is_deep)
   end
   # rubocop:disable Style/AccessModifierDeclarations

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: hiroiyomi
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Tomonori Murakami
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-07-12 00:00:00.000000000 Z
+date: 2018-07-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -76,6 +76,7 @@ files:
 - ".gitignore"
 - ".rspec"
 - ".rubocop.yml"
+- ".ruby-version"
 - ".travis.yml"
 - Gemfile
 - README.md
@@ -85,9 +86,12 @@ files:
 - hiroiyomi.gemspec
 - lib/hiroiyomi.rb
 - lib/hiroiyomi/html/attribute.rb
+- lib/hiroiyomi/html/childable.rb
 - lib/hiroiyomi/html/document.rb
+- lib/hiroiyomi/html/dom_parser.rb
+- lib/hiroiyomi/html/dom_parser_helper.rb
 - lib/hiroiyomi/html/element.rb
-- lib/hiroiyomi/html_parser.rb
+- lib/hiroiyomi/html/text.rb
 - lib/hiroiyomi/parser.rb
 - lib/hiroiyomi/root.rb
 - lib/hiroiyomi/version.rb

data/lib/hiroiyomi/html_parser.rb DELETED Viewed

@@ -1,191 +0,0 @@
-# frozen_string_literal: true
-require 'hiroiyomi/parser'
-require 'hiroiyomi/html/document'
-require 'hiroiyomi/html/element'
-require 'hiroiyomi/html/attribute'
-module Hiroiyomi
-  # HtmlParser
-  # rubocop:disable Metrics/ClassLength
-  class HtmlParser
-    include Parser
-    private
-    def do_parse(file)
-      document = Html::Document.new
-      return document if file.nil?
-      track_element(file, document)
-    end
-    # ========
-    # Extract HTML Element
-    # ========
-    def track_element(file, document)
-      while (c = file.getc)
-        break if c == '<' && extract_element(file, document)
-      end
-      document
-    end
-    def extract_element(file, document)
-      name = extract_name(file)
-      return false if name.empty?
-      attributes       = extract_attributes(file)
-      element          = Html::Element.new(name, attributes: attributes)
-      content          = extract_content(file, element)
-      element.content  = content unless content.empty?
-      document.element = element if validate_closing_element?(name, file)
-      true
-    end
-    # rubocop:disable Metrics/MethodLength
-    def extract_name(file, skip_space: false)
-      name = ''
-      while (c = file.getc)
-        case c
-        when /[\w-]/
-          name += c
-        else
-          next if skip_space && c =~ /\s/
-          file.ungetc(c)
-          break
-        end
-      end
-      name
-    end
-    # rubocop:enable Metrics/MethodLength
-    def extract_attributes(file)
-      attributes = []
-      while (attribute = extract_attribute(file))
-        attributes.push(attribute)
-      end
-      attributes
-    end
-    # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
-    def extract_attribute(file)
-      name = extract_name(file, skip_space: true)
-      return nil if name.empty?
-      value = ''
-      open  = false
-      while (c = file.getc)
-        case c
-        when '"'
-          break if open
-          open = true
-        else
-          value += c if open
-        end
-      end
-      Html::Attribute.new(name, value.empty? ? nil : value)
-    end
-    # rubocop:enable Metrics/MethodLength, Metrics/CyclomaticComplexity
-    # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/AbcSize
-    def extract_content(file, document)
-      content = ''
-      close   = false
-      append_content = lambda { |str|
-        content += str if close
-      }
-      while (c = file.getc)
-        case c
-        when '/'
-          # /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
-          next_c = file.getc
-          if next_c == '*'
-            append_content.call(c + next_c)
-            content += extract_content_of_cddata(file)
-          elsif !close
-            file.ungetc(c)
-            break
-          end
-        when '<'
-          extract_element(file, document)
-          # file.ungetc(c)
-          # track_element(file, document)
-          close = false
-        when '>'
-          close ||= true
-        else
-          append_content.call(c)
-        end
-      end
-      content
-    end
-    # /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
-    def extract_content_of_cddata(file)
-      content      = ''
-      start_cddata = false
-      append_content = lambda { |str|
-        content += str
-      }
-      while (c = file.getc)
-        case c
-        when '/'
-          next_c = file.getc
-          append_content.call(c + next_c) if next_c == '*'
-        when '*' # /*<![CDATA[*/!function(e,t,r){function ... ])/*]]>*/
-          next_c = file.getc
-          unless next_c == '/'
-            file.ungetc(next_c)
-            next_c = ''
-          end
-          start_cddata = !start_cddata
-          append_content.call(c + next_c)
-          return content unless start_cddata
-        else
-          append_content.call(c)
-        end
-      end
-      content
-    end
-    # rubocop:enable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/AbcSize
-    def validate_closing_element?(element_name, file)
-      open = false
-      while (c = file.getc)
-        return !open || extract_name(file) == element_name if c == '/'
-        open = true if c == '<'
-      end
-      false
-    end
-    # ========
-    # Filter HTML Element
-    # ========
-    def do_filter(document, filter:)
-      filter_element(document, filter, [])
-    end
-    def filter_element(element, filter, extracted_elements)
-      element.each do |child|
-        if filter&.include?(child.name)
-          extracted_elements.push(child)
-        else
-          filter_element(child, filter, extracted_elements)
-        end
-      end
-      extracted_elements
-    end
-  end
-  # rubocop:enable Metrics/ClassLength
-end