article_json 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - data/CHANGELOG.md +12 -0
 - data/LICENSE +21 -0
 - data/README.md +78 -0
 - data/bin/article_json_export_google_doc.rb +22 -0
 - data/bin/article_json_export_html.rb +14 -0
 - data/bin/article_json_parse_google_doc.rb +14 -0
 - data/bin/update_reference_document.sh +18 -0
 - data/lib/article_json/article.rb +53 -0
 - data/lib/article_json/configuration.rb +24 -0
 - data/lib/article_json/elements/base.rb +40 -0
 - data/lib/article_json/elements/embed.rb +58 -0
 - data/lib/article_json/elements/heading.rb +37 -0
 - data/lib/article_json/elements/image.rb +41 -0
 - data/lib/article_json/elements/list.rb +37 -0
 - data/lib/article_json/elements/paragraph.rb +31 -0
 - data/lib/article_json/elements/quote.rb +41 -0
 - data/lib/article_json/elements/text.rb +45 -0
 - data/lib/article_json/elements/text_box.rb +37 -0
 - data/lib/article_json/export/html/elements/base.rb +59 -0
 - data/lib/article_json/export/html/elements/embed.rb +28 -0
 - data/lib/article_json/export/html/elements/heading.rb +19 -0
 - data/lib/article_json/export/html/elements/image.rb +33 -0
 - data/lib/article_json/export/html/elements/list.rb +25 -0
 - data/lib/article_json/export/html/elements/paragraph.rb +17 -0
 - data/lib/article_json/export/html/elements/quote.rb +29 -0
 - data/lib/article_json/export/html/elements/shared/caption.rb +22 -0
 - data/lib/article_json/export/html/elements/shared/float.rb +17 -0
 - data/lib/article_json/export/html/elements/text.rb +44 -0
 - data/lib/article_json/export/html/elements/text_box.rb +25 -0
 - data/lib/article_json/export/html/exporter.rb +22 -0
 - data/lib/article_json/import/google_doc/html/css_analyzer.rb +144 -0
 - data/lib/article_json/import/google_doc/html/embedded_facebook_video_parser.rb +33 -0
 - data/lib/article_json/import/google_doc/html/embedded_parser.rb +113 -0
 - data/lib/article_json/import/google_doc/html/embedded_slideshare_parser.rb +36 -0
 - data/lib/article_json/import/google_doc/html/embedded_tweet_parser.rb +37 -0
 - data/lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb +29 -0
 - data/lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb +33 -0
 - data/lib/article_json/import/google_doc/html/heading_parser.rb +38 -0
 - data/lib/article_json/import/google_doc/html/image_parser.rb +75 -0
 - data/lib/article_json/import/google_doc/html/list_parser.rb +46 -0
 - data/lib/article_json/import/google_doc/html/node_analyzer.rb +111 -0
 - data/lib/article_json/import/google_doc/html/paragraph_parser.rb +26 -0
 - data/lib/article_json/import/google_doc/html/parser.rb +125 -0
 - data/lib/article_json/import/google_doc/html/quote_parser.rb +46 -0
 - data/lib/article_json/import/google_doc/html/shared/caption.rb +20 -0
 - data/lib/article_json/import/google_doc/html/shared/float.rb +21 -0
 - data/lib/article_json/import/google_doc/html/text_box_parser.rb +49 -0
 - data/lib/article_json/import/google_doc/html/text_parser.rb +89 -0
 - data/lib/article_json/utils/o_embed_resolver/base.rb +63 -0
 - data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +21 -0
 - data/lib/article_json/utils/o_embed_resolver/slideshare.rb +22 -0
 - data/lib/article_json/utils/o_embed_resolver/tweet.rb +23 -0
 - data/lib/article_json/utils/o_embed_resolver/vimeo_video.rb +21 -0
 - data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +21 -0
 - data/lib/article_json/utils.rb +11 -0
 - data/lib/article_json/version.rb +3 -0
 - data/lib/article_json.rb +55 -0
 - metadata +189 -0
 
| 
         @@ -0,0 +1,37 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Embedded-element parser for tweets that are referenced by their URL.
                    class EmbeddedTweetParser < EmbeddedParser
                      # Identifier for the kind of embed handled by this parser
                      # @return [Symbol]
                      def embed_type
                        :tweet
                      end

                      # Build the embed ID in the form "<handle>/<numeric id>" from the URL
                      # found in the node's whitespace-stripped text.
                      # @return [String, nil] `nil` if the text does not match `url_regexp`
                      def embed_id
                        text = @node.inner_text.strip
                        url_parts = text.match(self.class.url_regexp)
                        return unless url_parts
                        "#{url_parts[:handle]}/#{url_parts[:id]}"
                      end

                      class << self
                        # Pattern that recognises Twitter status URLs. It exposes the named
                        # captures `handle` and `id`, which `#embed_id` relies on.
                        # @return [Regexp]
                        def url_regexp
                          %r{
                            ^\S*                        # all protocols & sub domains
                            twitter\.com/               # domain
                            (?<handle>[^#/]+)           # twitter handle
                            (?:\#|/status/|/statuses/)  # optional path or hash char
                            (?<id>\d+)                  # numeric tweet id
                          }xi
                        end
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Embedded-element parser for Vimeo videos that are referenced by URL.
                    class EmbeddedVimeoVideoParser < EmbeddedParser
                      class << self
                        # Pattern that recognises Vimeo URLs; the numeric video ID is
                        # available through the named capture `id`.
                        # @return [Regexp]
                        def url_regexp
                          %r{
                            ^\S*           # all protocols & sub domains
                            vimeo\.com     # domain
                            .*[\#/]        # optional path
                            (?<id>[\d]+)   # numerical id
                          }xi
                        end
                      end

                      # Identifier for the kind of embed handled by this parser
                      # @return [Symbol]
                      def embed_type
                        :vimeo_video
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,33 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Embedded-element parser for YouTube videos that are referenced by URL.
                    class EmbeddedYoutubeVideoParser < EmbeddedParser
                      class << self
                        # Pattern that recognises YouTube URLs on both `youtube.com` and
                        # `youtu.be`; the video ID is available through the named
                        # capture `id`.
                        # @return [Regexp]
                        def url_regexp
                          %r{
                            ^\S*                   # all protocols & sub domains
                            (                      # different domains / paths
                              youtube\.com/(
                                [^/]+/.+/|(v|e(mbed)?)/|.*[?&]v=
                              )|
                              youtu\.be/
                            )
                            (?<id>[a-zA-Z0-9_-]+)  # alpha-numerical id, including _-
                          }xi
                        end
                      end

                      # Identifier for the kind of embed handled by this parser
                      # @return [Symbol]
                      def embed_type
                        :youtube_video
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,38 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Turns a heading node (`<h1>` to `<h5>`) into a heading element.
                    class HeadingParser
                      # @param [Nokogiri::HTML::Node] node
                      def initialize(node:)
                        @node = node
                      end

                      # Plain text of the heading; the node's inner text is returned
                      # as-is, with no markup.
                      # @return [String]
                      def content
                        @node.inner_text
                      end

                      # Heading level derived from the tag name, so `<h3>` yields 3.
                      # Only `h1` through `h5` are recognised.
                      # @return [Integer, nil] `nil` for any other tag name
                      def level
                        position = %w(h1 h2 h3 h4 h5).index(@node.name)
                        position + 1 if position
                      end

                      # @return [ArticleJSON::Elements::Heading]
                      def element
                        heading_level = level
                        heading_text = content
                        ArticleJSON::Elements::Heading.new(
                          level: heading_level,
                          content: heading_text
                        )
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,75 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Turns a node that contains an `<img>` tag into an image element,
                    # including its source URL, floating behaviour and caption.
                    class ImageParser
                      include Shared::Caption
                      include Shared::Float

                      # Matches a standalone `width` declaration given in pixels inside a
                      # style attribute. The lookbehind keeps it from matching the tail of
                      # other properties such as `max-width` or `border-width`.
                      WIDTH_STYLE_REGEXP = /(?<![\w-])width:\s*(?<px>\d+(?:\.\d+)?)px/
                      private_constant :WIDTH_STYLE_REGEXP

                      # @param [Nokogiri::HTML::Node] node
                      # @param [Nokogiri::HTML::Node] caption_node
                      # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
                      def initialize(node:, caption_node:, css_analyzer:)
                        @node = node
                        @caption_node = caption_node
                        @css_analyzer = css_analyzer

                        # Main node indicates the floating behavior
                        @float_node = @node
                      end

                      # The value of the image's `src` attribute
                      # @return [String]
                      def source_url
                        image_node.attribute('src').value
                      end

                      # The node of the actual image, i.e. the first `<img>` below the
                      # main node. The lookup result is cached once found.
                      # @return [Nokogiri::HTML::Node]
                      def image_node
                        @image_node ||= @node.xpath('.//img').first
                      end

                      # Check if the image is floating (left, right or not at all).
                      # The inherited implementation is only consulted for images that are
                      # narrow enough to float (presumably provided by `Shared::Float`).
                      # @return [Symbol, nil] `nil` if the image is too wide or its width
                      #   is unknown
                      def float
                        super if floatable_size?
                      end

                      # @return [ArticleJSON::Elements::Image]
                      def element
                        ArticleJSON::Elements::Image.new(
                          source_url: source_url,
                          float: float,
                          caption: caption
                        )
                      end

                      private

                      # Check if the image's width can be determined and is less than 500px
                      # This is about 3/4 of the google document width...
                      # @return [Boolean]
                      def floatable_size?
                        image_width && image_width < 500
                      end

                      # Get the specified width of the image if available
                      # The width can either be specified in a width attribute or via style
                      # attribute (in pixels, fractional values are truncated). If not,
                      # `nil` is returned.
                      # @return [Integer, nil]
                      def image_width
                        @image_width ||=
                          if image_node.has_attribute?('width')
                            image_node.attribute('width').value.to_i
                          elsif image_node.has_attribute?('style')
                            style = image_node.attribute('style').value
                            match = WIDTH_STYLE_REGEXP.match(style)
                            match[:px].to_i if match
                          end
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,46 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Turns an `<ol>` or `<ul>` node into a list element.
                    class ListParser
                      # @param [Nokogiri::HTML::Node] node
                      # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
                      def initialize(node:, css_analyzer:)
                        @node = node
                        @css_analyzer = css_analyzer
                      end

                      # Kind of list, derived from the tag name of the node
                      # @return [Symbol, nil] `:ordered` for `ol`, `:unordered` for `ul`,
                      #   `nil` for any other tag
                      def list_type
                        tag = @node.name
                        if tag == 'ol'
                          :ordered
                        elsif tag == 'ul'
                          :unordered
                        end
                      end

                      # Run every direct `li` child of the list through the paragraph
                      # parser; children with other tag names are ignored.
                      # @return [Array[ArticleJSON::Elements::Paragraph]]
                      def content
                        list_items = @node.children.select { |child| child.name == 'li' }
                        list_items.map do |item|
                          ParagraphParser.new(node: item, css_analyzer: @css_analyzer).element
                        end
                      end

                      # @return [ArticleJSON::Elements::List]
                      def element
                        type = list_type
                        paragraphs = content
                        ArticleJSON::Elements::List.new(list_type: type, content: paragraphs)
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,111 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Classifies a single node taken from the HTML of a Google Doc
                    # export. Every predicate caches its answer per instance, so an
                    # analyzer should not be reused after its node has been modified.
                    class NodeAnalyzer
                      # Tag names recognised as headings (`<h1>` to `<h5>`)
                      HEADING_TAGS = %w(h1 h2 h3 h4 h5).freeze

                      # Tag names recognised as ordered / unordered lists
                      LIST_TAGS = %w(ul ol).freeze

                      # Lines that open a text box (compared via `has_text?`)
                      TEXT_BOX_MARKERS = %w(textbox: highlight:).freeze

                      # Line that opens a quote (compared via `has_text?`)
                      QUOTE_MARKER = 'quote:'.freeze

                      # Node types mapped to the predicate that detects them. `type`
                      # walks this table top to bottom and returns the first match,
                      # so the order of the entries is significant.
                      TYPE_PREDICATES = {
                        empty: :empty?,
                        hr: :hr?,
                        heading: :heading?,
                        paragraph: :paragraph?,
                        list: :list?,
                        text_box: :text_box?,
                        quote: :quote?,
                        image: :image?,
                        embed: :embed?
                      }.freeze

                      attr_reader :node

                      # @param [Nokogiri::HTML::Node] node
                      def initialize(node)
                        @node = node
                      end

                      # Check if the node's text equals a certain text
                      # Both sides are stripped of surrounding whitespace and compared
                      # case-insensitively.
                      # @param [String] text
                      # @return [Boolean]
                      def has_text?(text)
                        node.inner_text.strip.downcase == text.strip.downcase
                      end

                      # Check if the node is empty, i.e. not containing any text
                      # Images and horizontal lines do not contain text either, so they
                      # are explicitly excluded and never count as empty.
                      # @return [Boolean]
                      def empty?
                        return @is_empty if defined? @is_empty
                        @is_empty = node.inner_text.strip.empty? && !image? && !hr?
                      end

                      # Check if the node is a header tag between <h1> and <h5>
                      # @return [Boolean]
                      def heading?
                        return @is_heading if defined? @is_heading
                        @is_heading = HEADING_TAGS.include?(node.name)
                      end

                      # Check if the node is a horizontal line (i.e. `<hr>`)
                      # @return [Boolean]
                      def hr?
                        node.name == 'hr'
                      end

                      # Check if the node is a normal text paragraph
                      # That is a non-empty `<p>` which is neither an image, a text box
                      # marker, a quote marker nor an embed.
                      # @return [Boolean]
                      def paragraph?
                        return @is_paragraph if defined? @is_paragraph
                        @is_paragraph =
                          node.name == 'p' &&
                            !empty? &&
                            !image? &&
                            !text_box? &&
                            !quote? &&
                            !embed?
                      end

                      # Check if the node is an ordered or unordered list
                      # @return [Boolean]
                      def list?
                        return @is_list if defined? @is_list
                        @is_list = LIST_TAGS.include?(node.name)
                      end

                      # Check if the node starts a text box
                      # Text boxes start with a single line saying "Textbox:" or
                      # "Highlight:".
                      # @return [Boolean]
                      def text_box?
                        return @is_text_box if defined? @is_text_box
                        @is_text_box =
                          TEXT_BOX_MARKERS.any? { |marker| has_text?(marker) }
                      end

                      # Check if the node starts a quote
                      # Quotes start with a single line saying "Quote:".
                      # @return [Boolean]
                      def quote?
                        return @is_quote if defined? @is_quote
                        @is_quote = has_text?(QUOTE_MARKER)
                      end

                      # Check if the node contains an image, i.e. has at least one
                      # `<img>` descendant
                      # @return [Boolean]
                      def image?
                        return @is_image if defined? @is_image
                        @is_image = node.xpath('.//img').any?
                      end

                      # Check if the node contains an embedded element
                      # The decision is delegated to `EmbeddedParser.supported?`.
                      # @return [Boolean]
                      def embed?
                        return @is_embed if defined? @is_embed
                        @is_embed = EmbeddedParser.supported?(node)
                      end

                      # Determine the type of this node
                      # The type is one of the elements supported by article_json, or
                      # `:unknown` if none of the predicates matches.
                      # @return [Symbol]
                      def type
                        TYPE_PREDICATES.each do |type_name, predicate|
                          return type_name if public_send(predicate)
                        end
                        :unknown
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,26 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Turns a single HTML node into a paragraph element whose content
                    # is whatever `TextParser.extract` returns for that node.
                    class ParagraphParser
                      # @param [Nokogiri::HTML::Node] node
                      # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
                      def initialize(node:, css_analyzer:)
                        @css_analyzer = css_analyzer
                        @node = node
                      end

                      # Extract the text elements of the node
                      # @return [Array[ArticleJSON::Elements::Text]]
                      def content
                        TextParser.extract(css_analyzer: @css_analyzer, node: @node)
                      end

                      # Build the paragraph element from the extracted text elements
                      # @return [ArticleJSON::Elements::Paragraph]
                      def element
                        text_elements = content
                        ArticleJSON::Elements::Paragraph.new(content: text_elements)
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,125 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Walks over the top-level nodes of the `<body>` of a Google Doc
                    # HTML export and converts them into article_json elements.
                    # Some elements span several body nodes (image + caption, text box
                    # or quote up to the next `<hr>`), which is why the body is
                    # consumed through an external enumerator.
                    class Parser
                      # @param [String] html
                      def initialize(html)
                        doc = Nokogiri::HTML(html)
                        body = doc.xpath('//body').last
                        # Without a <body> there is nothing to parse; use an empty
                        # enumerator instead of failing on `nil.children`.
                        @body_enumerator = (body ? body.children : []).to_enum

                        css_node = doc.xpath('//head/style').last
                        @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
                      end

                      # Parse the body of the document and return the result
                      # The result is cached after the first successful parse.
                      # @return [Array[ArticleJSON::Elements::Base]]
                      def parsed_content
                        @parsed_content ||= parse_body
                      end

                      private

                      # Loop over all body nodes and parse them
                      # Elements are collected in a local array so that an exception
                      # raised half-way through never leaves a partial result cached
                      # in `@parsed_content`.
                      # @return [Array[ArticleJSON::Elements::Base]]
                      def parse_body
                        elements = []
                        while body_has_more_nodes?
                          @current_node = NodeAnalyzer.new(@body_enumerator.next)
                          element = parse_current_node
                          # Nodes without an element (hr, empty, unknown) are skipped
                          elements << element if element
                        end
                        elements
                      end

                      # Parse the current node and return an element, if available
                      # @return [ArticleJSON::Elements::Base, nil]
                      def parse_current_node
                        case @current_node.type
                        when :heading then parse_heading
                        when :paragraph then parse_paragraph
                        when :list then parse_list
                        when :image then parse_image
                        when :text_box then parse_text_box
                        when :quote then parse_quote
                        when :embed then parse_embed
                        when :hr, :empty, :unknown then nil
                        end
                      end

                      # @return [ArticleJSON::Elements::Heading]
                      def parse_heading
                        HeadingParser.new(node: @current_node.node).element
                      end

                      # @return [ArticleJSON::Elements::Paragraph]
                      def parse_paragraph
                        ParagraphParser
                          .new(node: @current_node.node, css_analyzer: @css_analyzer)
                          .element
                      end

                      # @return [ArticleJSON::Elements::List]
                      def parse_list
                        ListParser
                          .new(node: @current_node.node, css_analyzer: @css_analyzer)
                          .element
                      end

                      # The node following the image is consumed as its caption.
                      # NOTE(review): `Enumerator#next` raises StopIteration when the
                      # image is the last body node - confirm whether the image parser
                      # can cope with a missing caption before guarding this.
                      # @return [ArticleJSON::Elements::Image]
                      def parse_image
                        ImageParser
                          .new(
                            node: @current_node.node,
                            caption_node: @body_enumerator.next,
                            css_analyzer: @css_analyzer
                          )
                          .element
                      end

                      # @return [ArticleJSON::Elements::TextBox]
                      def parse_text_box
                        TextBoxParser
                          .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
                          .element
                      end

                      # @return [ArticleJSON::Elements::Quote]
                      def parse_quote
                        QuoteParser
                          .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
                          .element
                      end

                      # The node following the embed is consumed as its caption.
                      # NOTE(review): same StopIteration caveat as in `parse_image`
                      # when the embed is the last body node.
                      # @return [ArticleJSON::Elements::Embed]
                      def parse_embed
                        EmbeddedParser.build(
                          node: @current_node.node,
                          caption_node: @body_enumerator.next,
                          css_analyzer: @css_analyzer
                        )
                      end

                      # Collect all nodes until a horizontal line, advancing the
                      # enumerator. The `<hr>` itself is not consumed. If the document
                      # ends before an `<hr>` is found, all remaining nodes are
                      # returned instead of raising StopIteration.
                      # @return [Array[Nokogiri::HTML::Node]]
                      def nodes_until_hr
                        nodes = []
                        while body_has_more_nodes?
                          break if NodeAnalyzer.new(@body_enumerator.peek).hr?
                          nodes << @body_enumerator.next
                        end
                        nodes
                      end

                      # `Enumerator#peek` raises StopIteration once the enumerator is
                      # exhausted, which is used here as the end-of-body signal.
                      # @return [Boolean]
                      def body_has_more_nodes?
                        @body_enumerator.peek
                        true
                      rescue StopIteration
                        false
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,46 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Parses the group of nodes that make up a quote: the leading
                    # nodes hold the quote's paragraphs and the last node holds
                    # its caption.
                    class QuoteParser
                      include Shared::Caption
                      include Shared::Float

                      # @param [Array[Nokogiri::HTML::Node]] nodes
                      # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
                      def initialize(nodes:, css_analyzer:)
                        # Nodes that NodeAnalyzer reports as empty are dropped
                        # before anything else looks at the list
                        @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
                        @css_analyzer = css_analyzer

                        # First node of the quote indicates floating behavior
                        @float_node = @nodes.first
                        # Last node of the quote contains the caption
                        @caption_node = @nodes.last
                      end

                      # Parse the quote's nodes to get a set of paragraphs
                      # The last node is ignored as it contains the quote caption
                      # @return [Array[ArticleJSON::Elements::Paragraph]]
                      def content
                        # A range slice is used instead of `take(size - 1)`:
                        # `take` raises ArgumentError for a negative count, which
                        # would happen when no non-empty node is left. The slice
                        # simply yields an empty list in that case.
                        @nodes[0...-1].map do |node|
                          ParagraphParser
                            .new(node: node, css_analyzer: @css_analyzer)
                            .element
                        end
                      end

                      # Build the quote element from the parsed content, the
                      # caption and the floating behavior
                      # @return [ArticleJSON::Elements::Quote]
                      def element
                        ArticleJSON::Elements::Quote.new(
                          content: content,
                          caption: caption,
                          float: float
                        )
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,20 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    module Shared
                      # Mixin providing caption parsing. The including class is
                      # expected to set `@caption_node` and `@css_analyzer`.
                      module Caption
                        # Hand the caption node over to the text parser
                        # @return [Array[ArticleJSON::Elements::Text]]
                        def caption
                          text_parser = ArticleJSON::Import::GoogleDoc::HTML::TextParser
                          text_parser.extract(node: @caption_node, css_analyzer: @css_analyzer)
                        end
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,21 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    module Shared
                      # Mixin providing float detection. The including class is
                      # expected to set `@float_node` and `@css_analyzer`.
                      module Float
                        # Check if the element is floating (left, right or not at all)
                        # The decision is based on the `class` attribute of
                        # `@float_node`, evaluated by the CSS analyzer.
                        # @return [Symbol, nil] `:right`, `:left`, or nil when
                        #   the element does not float
                        def float
                          # Without a float node there is nothing to inspect;
                          # treat it as "not floating" instead of raising
                          return if @float_node.nil?
                          return unless @float_node.has_attribute?('class')
                          node_class = @float_node.attribute('class').value || ''
                          return :right if @css_analyzer.right_aligned?(node_class)
                          return :left if @css_analyzer.left_aligned?(node_class)
                          nil
                        end
                      end
                    end
                  end
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,49 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module ArticleJSON
              module Import
                module GoogleDoc
                  module HTML
                    # Parses the group of nodes that make up a text box.
                    class TextBoxParser
                      include Shared::Float

                      # @param [Array[Nokogiri::HTML::Node]] nodes
                      # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
                      def initialize(nodes:, css_analyzer:)
                        @css_analyzer = css_analyzer
                        # Keep only the nodes NodeAnalyzer does not report as empty
                        @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }

                        # The floating behavior is read from the first remaining node
                        @float_node = @nodes.first
                      end

                      # Convert every node of the text box into a sub element
                      # Headings, paragraphs and lists are supported; nodes of
                      # any other type are left out of the result.
                      # @return [Array]
                      def content
                        @nodes.each_with_object([]) do |node, elements|
                          sub_element = parse_sub_node(node)
                          elements << sub_element unless sub_element.nil?
                        end
                      end

                      # Build the text box element from the floating behavior
                      # and the parsed content
                      # @return [ArticleJSON::Elements::TextBox]
                      def element
                        ArticleJSON::Elements::TextBox.new(float: float, content: content)
                      end

                      private

                      # Pick the parser matching the node's type
                      # @return [Object, nil] the parsed element, or nil for an
                      #   unsupported node type
                      def parse_sub_node(node)
                        node_type = NodeAnalyzer.new(node).type
                        case node_type
                        when :heading
                          HeadingParser.new(node: node).element
                        when :paragraph
                          ParagraphParser.new(node: node, css_analyzer: @css_analyzer).element
                        when :list
                          ListParser.new(node: node, css_analyzer: @css_analyzer).element
                        else
                          nil
                        end
                      end
                    end
                  end
                end
              end
            end
         
     |