RubyGems - article_json - Versions diffs - 0.1.0 - Mend

article_json 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +12 -0
data/LICENSE +21 -0
data/README.md +78 -0
data/bin/article_json_export_google_doc.rb +22 -0
data/bin/article_json_export_html.rb +14 -0
data/bin/article_json_parse_google_doc.rb +14 -0
data/bin/update_reference_document.sh +18 -0
data/lib/article_json/article.rb +53 -0
data/lib/article_json/configuration.rb +24 -0
data/lib/article_json/elements/base.rb +40 -0
data/lib/article_json/elements/embed.rb +58 -0
data/lib/article_json/elements/heading.rb +37 -0
data/lib/article_json/elements/image.rb +41 -0
data/lib/article_json/elements/list.rb +37 -0
data/lib/article_json/elements/paragraph.rb +31 -0
data/lib/article_json/elements/quote.rb +41 -0
data/lib/article_json/elements/text.rb +45 -0
data/lib/article_json/elements/text_box.rb +37 -0
data/lib/article_json/export/html/elements/base.rb +59 -0
data/lib/article_json/export/html/elements/embed.rb +28 -0
data/lib/article_json/export/html/elements/heading.rb +19 -0
data/lib/article_json/export/html/elements/image.rb +33 -0
data/lib/article_json/export/html/elements/list.rb +25 -0
data/lib/article_json/export/html/elements/paragraph.rb +17 -0
data/lib/article_json/export/html/elements/quote.rb +29 -0
data/lib/article_json/export/html/elements/shared/caption.rb +22 -0
data/lib/article_json/export/html/elements/shared/float.rb +17 -0
data/lib/article_json/export/html/elements/text.rb +44 -0
data/lib/article_json/export/html/elements/text_box.rb +25 -0
data/lib/article_json/export/html/exporter.rb +22 -0
data/lib/article_json/import/google_doc/html/css_analyzer.rb +144 -0
data/lib/article_json/import/google_doc/html/embedded_facebook_video_parser.rb +33 -0
data/lib/article_json/import/google_doc/html/embedded_parser.rb +113 -0
data/lib/article_json/import/google_doc/html/embedded_slideshare_parser.rb +36 -0
data/lib/article_json/import/google_doc/html/embedded_tweet_parser.rb +37 -0
data/lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb +29 -0
data/lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb +33 -0
data/lib/article_json/import/google_doc/html/heading_parser.rb +38 -0
data/lib/article_json/import/google_doc/html/image_parser.rb +75 -0
data/lib/article_json/import/google_doc/html/list_parser.rb +46 -0
data/lib/article_json/import/google_doc/html/node_analyzer.rb +111 -0
data/lib/article_json/import/google_doc/html/paragraph_parser.rb +26 -0
data/lib/article_json/import/google_doc/html/parser.rb +125 -0
data/lib/article_json/import/google_doc/html/quote_parser.rb +46 -0
data/lib/article_json/import/google_doc/html/shared/caption.rb +20 -0
data/lib/article_json/import/google_doc/html/shared/float.rb +21 -0
data/lib/article_json/import/google_doc/html/text_box_parser.rb +49 -0
data/lib/article_json/import/google_doc/html/text_parser.rb +89 -0
data/lib/article_json/utils/o_embed_resolver/base.rb +63 -0
data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +21 -0
data/lib/article_json/utils/o_embed_resolver/slideshare.rb +22 -0
data/lib/article_json/utils/o_embed_resolver/tweet.rb +23 -0
data/lib/article_json/utils/o_embed_resolver/vimeo_video.rb +21 -0
data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +21 -0
data/lib/article_json/utils.rb +11 -0
data/lib/article_json/version.rb +3 -0
data/lib/article_json.rb +55 -0
metadata +189 -0

data/lib/article_json/import/google_doc/html/embedded_tweet_parser.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class EmbeddedTweetParser < EmbeddedParser
+          # The type of this embedded element
+          # @return [Symbol]
+          def embed_type
+            :tweet
+          end
+          # Extract the tweet ID (including the handle) from an URL
+          # @return [String]
+          def embed_id
+            match = @node.inner_text.strip.match(self.class.url_regexp)
+            "#{match[:handle]}/#{match[:id]}" if match
+          end
+          class << self
+            # Regular expression to check if a given string is a Twitter URL
+            # Also used to extract the ID from the URL.
+            # @return [Regexp]
+            def url_regexp
+              %r{
+                ^\S*                        # all protocols & sub domains
+                twitter\.com/               # domain
+                (?<handle>[^#/]+)           # twitter handle
+                (?:\#|/status/|/statuses/)  # optional path or hash char
+                (?<id>\d+)                  # numeric tweet id
+              }xi
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb ADDED Viewed

@@ -0,0 +1,29 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        # Embedded element parser for Vimeo videos. Only the embed type and
+        # the URL pattern are defined here; everything else is inherited from
+        # EmbeddedParser.
+        class EmbeddedVimeoVideoParser < EmbeddedParser
+          # Identifies elements built by this parser as Vimeo videos
+          # @return [Symbol]
+          def embed_type
+            :vimeo_video
+          end
+          class << self
+            # Pattern that recognises a Vimeo URL (case-insensitive, written
+            # in extended mode)
+            # The named capture `id` holds a run of digits that directly
+            # follows a `/` or `#` somewhere after the domain.
+            # NOTE(review): presumably the inherited EmbeddedParser reads the
+            # `id` capture - confirm against that class.
+            # @return [Regexp]
+            def url_regexp
+              %r{
+                ^\S*           # all protocols & sub domains
+                vimeo\.com     # domain
+                .*[\#/]        # optional path
+                (?<id>[\d]+)   # numerical id
+              }xi
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb ADDED Viewed

@@ -0,0 +1,33 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        # Embedded element parser for Youtube videos. Only the embed type and
+        # the URL pattern are defined here; everything else is inherited from
+        # EmbeddedParser.
+        class EmbeddedYoutubeVideoParser < EmbeddedParser
+          # Identifies elements built by this parser as Youtube videos
+          # @return [Symbol]
+          def embed_type
+            :youtube_video
+          end
+          class << self
+            # Pattern that recognises a Youtube URL (case-insensitive, written
+            # in extended mode)
+            # Accepts `youtube.com/` followed by a `v/`, `e/` or `embed/`
+            # path, a `v=` query parameter or a two-segment path, as well as
+            # `youtu.be/` short links. The named capture `id` holds the
+            # following run of letters, digits, `_` and `-`.
+            # NOTE(review): presumably the inherited EmbeddedParser reads the
+            # `id` capture - confirm against that class.
+            # @return [Regexp]
+            def url_regexp
+              %r{
+                ^\S*                   # all protocols & sub domains
+                (                      # different domains / paths
+                  youtube\.com/(
+                    [^/]+/.+/|(v|e(mbed)?)/|.*[?&]v=
+                  )|
+                  youtu\.be/
+                )
+                (?<id>[a-zA-Z0-9_-]+)  # alpha-numerical id, including _-
+              }xi
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/heading_parser.rb ADDED Viewed

@@ -0,0 +1,38 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class HeadingParser
+          # @param [Nokogiri::HTML::Node] node
+          def initialize(node:)
+            @node = node
+          end
+          # The raw text content of the heading, without any markup
+          # @return [String]
+          def content
+            @node.inner_text
+          end
+          # Determine the level of the heading
+          # The level corresponds to the header tag, e.g. `<h3>` is level 3.
+          # @return [Integer]
+          def level
+            case @node.name
+              when 'h1' then 1
+              when 'h2' then 2
+              when 'h3' then 3
+              when 'h4' then 4
+              when 'h5' then 5
+            end
+          end
+          # @return [ArticleJSON::Elements::Heading]
+          def element
+            ArticleJSON::Elements::Heading.new(level: level, content: content)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/image_parser.rb ADDED Viewed

@@ -0,0 +1,75 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class ImageParser
+          include Shared::Caption
+          include Shared::Float
+          # @param [Nokogiri::HTML::Node] node
+          # @param [Nokogiri::HTML::Node] caption_node
+          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
+          def initialize(node:, caption_node:, css_analyzer:)
+            @node = node
+            @caption_node = caption_node
+            @css_analyzer = css_analyzer
+            # Main node indicates the floating behavior
+            @float_node = @node
+          end
+          # The value of the image's `src` attribute
+          # @return [String]
+          def source_url
+            image_node.attribute('src').value
+          end
+          # The node of the actual image
+          # @return [Nokogiri::HTML::Node]
+          def image_node
+            @node.xpath('.//img').first
+          end
+          # Check if the image is floating (left, right or not at all)
+          # @return [Symbol]
+          def float
+            super if floatable_size?
+          end
+          # @return [ArticleJSON::Elements::Image]
+          def element
+            ArticleJSON::Elements::Image.new(
+              source_url: source_url,
+              float: float,
+              caption: caption
+            )
+          end
+          private
+          # Check if the image's width can be determined and is less than 500px
+          # This is about 3/4 of the google document width...
+          # @return [Boolean]
+          def floatable_size?
+            image_width && image_width < 500
+          end
+          # Get the specified width of the image if available
+          # The width can either be specified in a width attribute or via style
+          # attribute. If not, `nil` is returned.
+          # @return [Integer]
+          def image_width
+            @image_width ||=
+              if image_node.has_attribute?('width')
+                image_node.attribute('width').value.to_i
+              elsif image_node.has_attribute?('style')
+                regex = /width:\s?(?<px>\d+|(\d+?\.\d+))px/
+                match = image_node.attribute('style').value.match(regex)
+                match['px'].to_i if match && match['px']
+              end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/list_parser.rb ADDED Viewed

@@ -0,0 +1,46 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class ListParser
+          # @param [Nokogiri::HTML::Node] node
+          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
+          def initialize(node:, css_analyzer:)
+            @node = node
+            @css_analyzer = css_analyzer
+          end
+          # Determine the list type, either ordered or unordered
+          # @return [Symbol]
+          def list_type
+            case @node.name
+              when 'ol' then :ordered
+              when 'ul' then :unordered
+            end
+          end
+          # Parse the list's sub nodes to get a set of paragraphs
+          # @return [Array[ArticleJSON::Elements::Paragraph]]
+          def content
+            @node
+              .children
+              .select { |node| node.name == 'li' }
+              .map do |node|
+                ParagraphParser
+                  .new(node: node, css_analyzer: @css_analyzer)
+                  .element
+              end
+          end
+          # @return [ArticleJSON::Elements::List]
+          def element
+            ArticleJSON::Elements::List.new(
+              list_type: list_type,
+              content: content
+            )
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/node_analyzer.rb ADDED Viewed

@@ -0,0 +1,111 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class NodeAnalyzer
+          attr_reader :node
+          # @param [Nokogiri::HTML::Node] node
+          def initialize(node)
+            @node = node
+          end
+          # Check if a node equals a certain text
+          # @param [String] text
+          # @return [Boolean]
+          def has_text?(text)
+            node.inner_text.strip.downcase == text.strip.downcase
+          end
+          # Check if the node is empty, i.e. not containing any text
+          # Given that images are the only nodes without text, we have to make
+          # sure that it's not an image.
+          # @return [Boolean]
+          def empty?
+            return @is_empty if defined? @is_empty
+            @is_empty = node.inner_text.strip.empty? && !image? && !hr?
+          end
+          # Check if the node is a header tag between <h1> and <h5>
+          # @return [Boolean]
+          def heading?
+            return @is_heading if defined? @is_heading
+            @is_heading = %w(h1 h2 h3 h4 h5).include?(node.name)
+          end
+          # Check if the node is a horizontal line (i.e. `<hr>`)
+          # @return [Boolean]
+          def hr?
+            node.name == 'hr'
+          end
+          # Check if the node is a normal text paragraph
+          # @return [Boolean]
+          def paragraph?
+            return @is_paragraph if defined? @is_paragraph
+            @is_paragraph =
+              node.name == 'p' &&
+                !empty? &&
+                !image? &&
+                !text_box? &&
+                !quote? &&
+                !embed?
+          end
+          # Check if the node contains an ordered or unordered list
+          # @return [Boolean]
+          def list?
+            return @is_list if defined? @is_list
+            @is_list = %w(ul ol).include?(node.name)
+          end
+          # Check if the node starts a text box
+          # Text boxes start with a single line saying "Textbox:" or "Highlight:".
+          # @return [Boolean]
+          def text_box?
+            return @is_text_box if defined? @is_text_box
+            @is_text_box = has_text?('textbox:') || has_text?('highlight:')
+          end
+          # Check if the node starts a quote
+          # Quotes start with a single line saying "Quote:".
+          # @return [Boolean]
+          def quote?
+            return @is_quote if defined? @is_quote
+            @is_quote = has_text?('quote:')
+          end
+          # Check if the node contains an image
+          # @return [Boolean]
+          def image?
+            return @is_image if defined? @is_image
+            @is_image = node.xpath('.//img').length > 0
+          end
+          # Check if the node contains an embedded element
+          # @return [Boolean]
+          def embed?
+            return @is_embed if defined? @is_embed
+            @is_embed = EmbeddedParser.supported?(node)
+          end
+          # Determine the type of this node
+          # The type is one of the elements supported by article_json.
+          # @return [Symbol]
+          def type
+            return :empty if empty?
+            return :hr if hr?
+            return :heading if heading?
+            return :paragraph if paragraph?
+            return :list if list?
+            return :text_box if text_box?
+            return :quote if quote?
+            return :image if image?
+            return :embed if embed?
+            :unknown
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/paragraph_parser.rb ADDED Viewed

@@ -0,0 +1,26 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class ParagraphParser
+          # @param [Nokogiri::HTML::Node] node
+          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
+          def initialize(node:, css_analyzer:)
+            @node = node
+            @css_analyzer = css_analyzer
+          end
+          # @return [Array[ArticleJSON::Elements::Text]]
+          def content
+            TextParser.extract(node: @node, css_analyzer: @css_analyzer)
+          end
+          # @return [ArticleJSON::Elements::Paragraph]
+          def element
+            ArticleJSON::Elements::Paragraph.new(content: content)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/parser.rb ADDED Viewed

@@ -0,0 +1,125 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class Parser
+          # @param [String] html
+          def initialize(html)
+            doc = Nokogiri::HTML(html)
+            @body_enumerator = doc.xpath('//body').last.children.to_enum
+            css_node = doc.xpath('//head/style').last
+            @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
+          end
+          # Parse the body of the document and return the result
+          # @return [Array[ArticleJSON::Elements::Base]]
+          def parsed_content
+            @parsed_content ||= parse_body
+          end
+          private
+          # Loop over all body nodes and parse them
+          # @return [Array[ArticleJSON::Elements::Base]]
+          def parse_body
+            @parsed_content = []
+            while body_has_more_nodes?
+              @parsed_content << begin
+                @current_node = NodeAnalyzer.new(@body_enumerator.next)
+                parse_current_node || next
+              end
+            end
+            @parsed_content
+          end
+          # Parse the current node and return an element, if available
+          # @return [ArticleJSON::Elements::Base]
+          def parse_current_node
+            case @current_node.type
+            when :heading then parse_heading
+            when :paragraph then parse_paragraph
+            when :list then parse_list
+            when :image then parse_image
+            when :text_box then parse_text_box
+            when :quote then parse_quote
+            when :embed then parse_embed
+            when :hr, :empty, :unknown then nil
+            end
+          end
+          # @return [ArticleJSON::Elements::Heading]
+          def parse_heading
+            HeadingParser.new(node: @current_node.node).element
+          end
+          # @return [ArticleJSON::Elements::Paragraph]
+          def parse_paragraph
+            ParagraphParser
+              .new(node: @current_node.node, css_analyzer: @css_analyzer)
+              .element
+          end
+          # @return [ArticleJSON::Elements::List]
+          def parse_list
+            ListParser
+              .new(node: @current_node.node, css_analyzer: @css_analyzer)
+              .element
+          end
+          # @return [ArticleJSON::Elements::Image]
+          def parse_image
+            ImageParser
+              .new(
+                node: @current_node.node,
+                caption_node: @body_enumerator.next,
+                css_analyzer: @css_analyzer
+              )
+              .element
+          end
+          # @return [ArticleJSON::Elements::TextBox]
+          def parse_text_box
+            TextBoxParser
+              .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
+              .element
+          end
+          # @return [ArticleJSON::Elements::Quote]
+          def parse_quote
+            QuoteParser
+              .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
+              .element
+          end
+          # @return [ArticleJSON::Elements::Embed]
+          def parse_embed
+            EmbeddedParser.build(
+              node: @current_node.node,
+              caption_node: @body_enumerator.next,
+              css_analyzer: @css_analyzer
+            )
+          end
+          # Collect all nodes until a horizontal line, advancing the enumerator
+          # @return [Array[Nokogiri::HTML::Node]]
+          def nodes_until_hr
+            nodes = []
+            until NodeAnalyzer.new(@body_enumerator.peek).hr?
+              nodes << @body_enumerator.next
+            end
+            nodes
+          end
+          # @return [Boolean]
+          def body_has_more_nodes?
+            @body_enumerator.peek
+            true
+          rescue StopIteration
+            false
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/quote_parser.rb ADDED Viewed

@@ -0,0 +1,46 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class QuoteParser
+          include Shared::Caption
+          include Shared::Float
+          # @param [Array[Nokogiri::HTML::Node]] nodes
+          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
+          def initialize(nodes:, css_analyzer:)
+            @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
+            @css_analyzer = css_analyzer
+            # First node of the quote indicates floating behavior
+            @float_node = @nodes.first
+            # Last node of the quote contains the caption
+            @caption_node = @nodes.last
+          end
+          # Parse the quote's nodes to get a set of paragraphs
+          # The last node is ignored as it contains the quote caption
+          # @return [Array[ArticleJSON::Elements::Paragraph]]
+          def content
+            @nodes
+              .take(@nodes.size - 1)
+              .map do |node|
+                ParagraphParser
+                  .new(node: node, css_analyzer: @css_analyzer)
+                  .element
+              end
+          end
+          # @return [ArticleJSON::Elements::Quote]
+          def element
+            ArticleJSON::Elements::Quote.new(
+              content: content,
+              caption: caption,
+              float: float
+            )
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/shared/caption.rb ADDED Viewed

@@ -0,0 +1,20 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        module Shared
+          module Caption
+            # Parse the caption node
+            # @return [Array[ArticleJSON::Elements::Text]]
+            def caption
+              ArticleJSON::Import::GoogleDoc::HTML::TextParser.extract(
+                node: @caption_node,
+                css_analyzer: @css_analyzer
+              )
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/shared/float.rb ADDED Viewed

@@ -0,0 +1,21 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        module Shared
+          module Float
+            # Check if the quote is floating (left, right or not at all)
+            # @return [Symbol]
+            def float
+              return unless @float_node.has_attribute?('class')
+              node_class = @float_node.attribute('class').value || ''
+              return :right if @css_analyzer.right_aligned?(node_class)
+              return :left if @css_analyzer.left_aligned?(node_class)
+              nil
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/article_json/import/google_doc/html/text_box_parser.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module ArticleJSON
+  module Import
+    module GoogleDoc
+      module HTML
+        class TextBoxParser
+          include Shared::Float
+          # @param [Array[Nokogiri::HTML::Node]] nodes
+          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
+          def initialize(nodes:, css_analyzer:)
+            @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
+            @css_analyzer = css_analyzer
+            # First node of the text box indicates floating behavior
+            @float_node = @nodes.first
+          end
+          # Parse the text box's nodes to get a list of sub elements
+          # Supported sub elements are: headings, paragraphs & lists.
+          # @return [Array]
+          def content
+            @nodes.map { |node| parse_sub_node(node) }.compact
+          end
+          # Hash representation of this text box
+          # @return [ArticleJSON::Elements::TextBox]
+          def element
+            ArticleJSON::Elements::TextBox.new(float: float, content: content)
+          end
+          private
+          def parse_sub_node(node)
+            case NodeAnalyzer.new(node).type
+            when :heading
+              HeadingParser.new(node: node).element
+            when :paragraph
+              ParagraphParser
+                .new(node: node, css_analyzer: @css_analyzer)
+                .element
+            when :list
+              ListParser.new(node: node, css_analyzer: @css_analyzer).element
+            end
+          end
+        end
+      end
+    end
+  end
+end