article_json 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +12 -0
- data/LICENSE +21 -0
- data/README.md +78 -0
- data/bin/article_json_export_google_doc.rb +22 -0
- data/bin/article_json_export_html.rb +14 -0
- data/bin/article_json_parse_google_doc.rb +14 -0
- data/bin/update_reference_document.sh +18 -0
- data/lib/article_json/article.rb +53 -0
- data/lib/article_json/configuration.rb +24 -0
- data/lib/article_json/elements/base.rb +40 -0
- data/lib/article_json/elements/embed.rb +58 -0
- data/lib/article_json/elements/heading.rb +37 -0
- data/lib/article_json/elements/image.rb +41 -0
- data/lib/article_json/elements/list.rb +37 -0
- data/lib/article_json/elements/paragraph.rb +31 -0
- data/lib/article_json/elements/quote.rb +41 -0
- data/lib/article_json/elements/text.rb +45 -0
- data/lib/article_json/elements/text_box.rb +37 -0
- data/lib/article_json/export/html/elements/base.rb +59 -0
- data/lib/article_json/export/html/elements/embed.rb +28 -0
- data/lib/article_json/export/html/elements/heading.rb +19 -0
- data/lib/article_json/export/html/elements/image.rb +33 -0
- data/lib/article_json/export/html/elements/list.rb +25 -0
- data/lib/article_json/export/html/elements/paragraph.rb +17 -0
- data/lib/article_json/export/html/elements/quote.rb +29 -0
- data/lib/article_json/export/html/elements/shared/caption.rb +22 -0
- data/lib/article_json/export/html/elements/shared/float.rb +17 -0
- data/lib/article_json/export/html/elements/text.rb +44 -0
- data/lib/article_json/export/html/elements/text_box.rb +25 -0
- data/lib/article_json/export/html/exporter.rb +22 -0
- data/lib/article_json/import/google_doc/html/css_analyzer.rb +144 -0
- data/lib/article_json/import/google_doc/html/embedded_facebook_video_parser.rb +33 -0
- data/lib/article_json/import/google_doc/html/embedded_parser.rb +113 -0
- data/lib/article_json/import/google_doc/html/embedded_slideshare_parser.rb +36 -0
- data/lib/article_json/import/google_doc/html/embedded_tweet_parser.rb +37 -0
- data/lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb +29 -0
- data/lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb +33 -0
- data/lib/article_json/import/google_doc/html/heading_parser.rb +38 -0
- data/lib/article_json/import/google_doc/html/image_parser.rb +75 -0
- data/lib/article_json/import/google_doc/html/list_parser.rb +46 -0
- data/lib/article_json/import/google_doc/html/node_analyzer.rb +111 -0
- data/lib/article_json/import/google_doc/html/paragraph_parser.rb +26 -0
- data/lib/article_json/import/google_doc/html/parser.rb +125 -0
- data/lib/article_json/import/google_doc/html/quote_parser.rb +46 -0
- data/lib/article_json/import/google_doc/html/shared/caption.rb +20 -0
- data/lib/article_json/import/google_doc/html/shared/float.rb +21 -0
- data/lib/article_json/import/google_doc/html/text_box_parser.rb +49 -0
- data/lib/article_json/import/google_doc/html/text_parser.rb +89 -0
- data/lib/article_json/utils/o_embed_resolver/base.rb +63 -0
- data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +21 -0
- data/lib/article_json/utils/o_embed_resolver/slideshare.rb +22 -0
- data/lib/article_json/utils/o_embed_resolver/tweet.rb +23 -0
- data/lib/article_json/utils/o_embed_resolver/vimeo_video.rb +21 -0
- data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +21 -0
- data/lib/article_json/utils.rb +11 -0
- data/lib/article_json/version.rb +3 -0
- data/lib/article_json.rb +55 -0
- metadata +189 -0
@@ -0,0 +1,37 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Embedded-element parser for tweets. It contributes the embed type,
        # the URL pattern and the ID extraction; everything else is inherited
        # from `EmbeddedParser`.
        class EmbeddedTweetParser < EmbeddedParser
          # Pattern used both to recognise a Twitter URL and to pull the
          # `handle` and `id` named groups out of it.
          # @return [Regexp]
          def self.url_regexp
            %r{
              ^\S* # all protocols & sub domains
              twitter\.com/ # domain
              (?<handle>[^#/]+) # twitter handle
              (?:\#|/status/|/statuses/) # optional path or hash char
              (?<id>\d+) # numeric tweet id
            }xi
          end

          # Kind of embedded element produced by this parser
          # @return [Symbol]
          def embed_type
            :tweet
          end

          # Build the tweet ID as `handle/id` from the stripped text of
          # `@node` (the instance variable is not assigned in this class).
          # @return [String] or `nil` if the text does not match the pattern
          def embed_id
            url_parts = self.class.url_regexp.match(@node.inner_text.strip)
            return unless url_parts
            [url_parts[:handle], url_parts[:id]].join('/')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Embedded-element parser for Vimeo videos. Only the embed type and
        # the URL pattern are defined here; the rest comes from
        # `EmbeddedParser`.
        class EmbeddedVimeoVideoParser < EmbeddedParser
          # Pattern that recognises a Vimeo URL and exposes the numerical
          # video ID through the named group `id`.
          # @return [Regexp]
          def self.url_regexp
            %r{
              ^\S* # all protocols & sub domains
              vimeo\.com # domain
              .*[\#/] # optional path
              (?<id>[\d]+) # numerical id
            }xi
          end

          # Kind of embedded element produced by this parser
          # @return [Symbol]
          def embed_type
            :vimeo_video
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Embedded-element parser for Youtube videos. Only the embed type and
        # the URL pattern are defined here; the rest comes from
        # `EmbeddedParser`.
        class EmbeddedYoutubeVideoParser < EmbeddedParser
          # Pattern that recognises `youtube.com` and `youtu.be` URLs and
          # exposes the video ID through the named group `id`.
          # @return [Regexp]
          def self.url_regexp
            %r{
              ^\S* # all protocols & sub domains
              ( # different domains / paths
              youtube\.com/(
              [^/]+/.+/|(v|e(mbed)?)/|.*[?&]v=
              )|
              youtu\.be/
              )
              (?<id>[a-zA-Z0-9_-]+) # alpha-numerical id, including _-
            }xi
          end

          # Kind of embedded element produced by this parser
          # @return [Symbol]
          def embed_type
            :youtube_video
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns a single heading node (`<h1>` to `<h5>`) into a heading
        # element.
        class HeadingParser
          # @param [Nokogiri::HTML::Node] node
          def initialize(node:)
            @node = node
          end

          # Plain text of the heading; markup inside the node is dropped
          # @return [String]
          def content
            @node.inner_text
          end

          # Heading level derived from the tag name, e.g. `<h3>` gives 3.
          # Tags other than `h1` to `h5` yield `nil`.
          # @return [Integer]
          def level
            tag_name = @node.name
            (1..5).find { |depth| "h#{depth}" == tag_name }
          end

          # @return [ArticleJSON::Elements::Heading]
          def element
            ArticleJSON::Elements::Heading.new(level: level, content: content)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns a node containing an `<img>` plus a separate caption node into
        # an image element. Caption and float handling come from the shared
        # modules.
        class ImageParser
          include Shared::Caption
          include Shared::Float

          # @param [Nokogiri::HTML::Node] node
          # @param [Nokogiri::HTML::Node] caption_node
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(node:, caption_node:, css_analyzer:)
            @node = node
            @caption_node = caption_node
            @css_analyzer = css_analyzer

            # Main node indicates the floating behavior
            @float_node = @node
          end

          # The value of the image's `src` attribute
          # @return [String]
          def source_url
            image_node.attribute('src').value
          end

          # The node of the actual image, i.e. the first `<img>` found
          # anywhere below the main node
          # @return [Nokogiri::HTML::Node]
          def image_node
            @node.xpath('.//img').first
          end

          # Check if the image is floating (left, right or not at all)
          # Images whose width is unknown or 500px and more never float, the
          # shared float detection is only consulted for smaller ones.
          # @return [Symbol]
          def float
            super if floatable_size?
          end

          # @return [ArticleJSON::Elements::Image]
          def element
            ArticleJSON::Elements::Image.new(
              source_url: source_url,
              float: float,
              caption: caption
            )
          end

          private

          # Check if the image's width can be determined and is less than 500px
          # This is about 3/4 of the google document width...
          # @return [Boolean] falsy (`nil`) when the width is unknown
          def floatable_size?
            image_width && image_width < 500
          end

          # Get the specified width of the image if available
          # The width can either be specified in a width attribute or via style
          # attribute. If not, `nil` is returned.
          # The result is cached, including a `nil` result.
          # @return [Integer]
          def image_width
            return @image_width if defined? @image_width
            @image_width =
              if image_node.has_attribute?('width')
                image_node.attribute('width').value.to_i
              elsif image_node.has_attribute?('style')
                # The lookbehind keeps `max-width`, `min-width`,
                # `border-width` etc. from being mistaken for the width
                # property; fractional pixel values are truncated by `to_i`.
                regex = /(?<![\w-])width:\s*(?<px>\d+(?:\.\d+)?)px/
                match = image_node.attribute('style').value.match(regex)
                match[:px].to_i if match
              end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns an `<ol>` or `<ul>` node into a list element whose items are
        # paragraphs.
        class ListParser
          # @param [Nokogiri::HTML::Node] node
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(node:, css_analyzer:)
            @node = node
            @css_analyzer = css_analyzer
          end

          # Map the tag name to the list type: `ol` is ordered, `ul` is
          # unordered, anything else gives `nil`.
          # @return [Symbol]
          def list_type
            { 'ol' => :ordered, 'ul' => :unordered }[@node.name]
          end

          # Build one paragraph per direct `<li>` child; other children are
          # skipped.
          # @return [Array[ArticleJSON::Elements::Paragraph]]
          def content
            list_items = @node.children.select { |child| child.name == 'li' }
            list_items.map do |item|
              item_parser =
                ParagraphParser.new(node: item, css_analyzer: @css_analyzer)
              item_parser.element
            end
          end

          # @return [ArticleJSON::Elements::List]
          def element
            ArticleJSON::Elements::List.new(list_type: list_type, content: content)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Wraps a single node and answers questions about what kind of
        # article element it represents. Most answers are cached per instance.
        class NodeAnalyzer
          attr_reader :node

          # @param [Nokogiri::HTML::Node] node
          def initialize(node)
            @node = node
          end

          # Compare the node's text with the given text, ignoring letter case
          # and surrounding whitespace on both sides
          # @param [String] text
          # @return [Boolean]
          def has_text?(text)
            actual = node.inner_text.strip.downcase
            expected = text.strip.downcase
            actual == expected
          end

          # A node counts as empty when it has no text after stripping and is
          # neither an image nor a horizontal line (both carry no text but
          # still have meaning).
          # @return [Boolean]
          def empty?
            unless defined?(@is_empty)
              @is_empty = node.inner_text.strip.empty? && !(image? || hr?)
            end
            @is_empty
          end

          # True for header tags from <h1> up to and including <h5>
          # @return [Boolean]
          def heading?
            unless defined?(@is_heading)
              @is_heading = %w(h1 h2 h3 h4 h5).include?(node.name)
            end
            @is_heading
          end

          # True for a horizontal line, i.e. an `<hr>` tag
          # @return [Boolean]
          def hr?
            node.name == 'hr'
          end

          # True for a `<p>` tag that is not claimed by any of the more
          # specific kinds (empty, image, text box, quote, embed)
          # @return [Boolean]
          def paragraph?
            unless defined?(@is_paragraph)
              @is_paragraph =
                node.name == 'p' &&
                !(empty? || image? || text_box? || quote? || embed?)
            end
            @is_paragraph
          end

          # True for ordered and unordered list tags
          # @return [Boolean]
          def list?
            unless defined?(@is_list)
              @is_list = %w(ul ol).include?(node.name)
            end
            @is_list
          end

          # True if the node opens a text box, which is marked by a line
          # saying just "Textbox:" or "Highlight:"
          # @return [Boolean]
          def text_box?
            unless defined?(@is_text_box)
              @is_text_box = has_text?('textbox:') || has_text?('highlight:')
            end
            @is_text_box
          end

          # True if the node opens a quote, which is marked by a line saying
          # just "Quote:"
          # @return [Boolean]
          def quote?
            unless defined?(@is_quote)
              @is_quote = has_text?('quote:')
            end
            @is_quote
          end

          # True if at least one `<img>` exists anywhere below the node
          # @return [Boolean]
          def image?
            unless defined?(@is_image)
              @is_image = node.xpath('.//img').length.positive?
            end
            @is_image
          end

          # True if `EmbeddedParser` reports the node as supported
          # @return [Boolean]
          def embed?
            unless defined?(@is_embed)
              @is_embed = EmbeddedParser.supported?(node)
            end
            @is_embed
          end

          # Classify the node
          # The predicates are tried in a fixed order and the first one that
          # holds decides the type; `:unknown` is the fallback.
          # @return [Symbol]
          def type
            kinds = %i(empty hr heading paragraph list text_box quote image embed)
            kinds.find { |kind| public_send("#{kind}?") } || :unknown
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns a single node into a paragraph element made of text elements.
        class ParagraphParser
          # @param [Nokogiri::HTML::Node] node
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(node:, css_analyzer:)
            @node = node
            @css_analyzer = css_analyzer
          end

          # Text elements extracted from the node by `TextParser`
          # @return [Array[ArticleJSON::Elements::Text]]
          def content
            TextParser.extract(css_analyzer: @css_analyzer, node: @node)
          end

          # @return [ArticleJSON::Elements::Paragraph]
          def element
            paragraph_content = content
            ArticleJSON::Elements::Paragraph.new(content: paragraph_content)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,125 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Walks over the top-level nodes of an HTML document body and turns
        # them into article elements by delegating to the specialised parsers.
        class Parser
          # @param [String] html
          def initialize(html)
            doc = Nokogiri::HTML(html)
            # Only the children of the last `<body>` node are traversed
            @body_enumerator = doc.xpath('//body').last.children.to_enum

            # The last `<style>` node of the head feeds the CSS analyzer;
            # `nil` is passed when there is none
            css_node = doc.xpath('//head/style').last
            @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
          end

          # Parse the body of the document and return the result
          # The body is only parsed once, later calls return the same array.
          # @return [Array[ArticleJSON::Elements::Base]]
          def parsed_content
            @parsed_content ||= parse_body
          end

          private

          # Loop over all body nodes and parse them
          # Nodes that do not yield an element (horizontal lines, empty and
          # unknown nodes) are skipped.
          # @return [Array[ArticleJSON::Elements::Base]]
          def parse_body
            @parsed_content = []
            while body_has_more_nodes?
              @current_node = NodeAnalyzer.new(@body_enumerator.next)
              element = parse_current_node
              @parsed_content << element if element
            end
            @parsed_content
          end

          # Parse the current node and return an element, if available
          # @return [ArticleJSON::Elements::Base]
          def parse_current_node
            case @current_node.type
            when :heading then parse_heading
            when :paragraph then parse_paragraph
            when :list then parse_list
            when :image then parse_image
            when :text_box then parse_text_box
            when :quote then parse_quote
            when :embed then parse_embed
            when :hr, :empty, :unknown then nil
            end
          end

          # @return [ArticleJSON::Elements::Heading]
          def parse_heading
            HeadingParser.new(node: @current_node.node).element
          end

          # @return [ArticleJSON::Elements::Paragraph]
          def parse_paragraph
            ParagraphParser
              .new(node: @current_node.node, css_analyzer: @css_analyzer)
              .element
          end

          # @return [ArticleJSON::Elements::List]
          def parse_list
            ListParser
              .new(node: @current_node.node, css_analyzer: @css_analyzer)
              .element
          end

          # The node following the image is consumed as its caption.
          # NOTE(review): `Enumerator#next` raises `StopIteration` when the
          # image is the very last body node - confirm whether that can occur.
          # @return [ArticleJSON::Elements::Image]
          def parse_image
            ImageParser
              .new(
                node: @current_node.node,
                caption_node: @body_enumerator.next,
                css_analyzer: @css_analyzer
              )
              .element
          end

          # @return [ArticleJSON::Elements::TextBox]
          def parse_text_box
            TextBoxParser
              .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
              .element
          end

          # @return [ArticleJSON::Elements::Quote]
          def parse_quote
            QuoteParser
              .new(nodes: nodes_until_hr, css_analyzer: @css_analyzer)
              .element
          end

          # The node following the embed is consumed as its caption.
          # NOTE(review): same end-of-body caveat as for images.
          # @return [ArticleJSON::Elements::Embed]
          def parse_embed
            EmbeddedParser.build(
              node: @current_node.node,
              caption_node: @body_enumerator.next,
              css_analyzer: @css_analyzer
            )
          end

          # Collect all nodes until a horizontal line, advancing the enumerator
          # The horizontal line itself is left on the enumerator. If the body
          # ends before a horizontal line shows up, all remaining nodes are
          # returned instead of raising `StopIteration`.
          # @return [Array[Nokogiri::HTML::Node]]
          def nodes_until_hr
            nodes = []
            while body_has_more_nodes?
              break if NodeAnalyzer.new(@body_enumerator.peek).hr?
              nodes << @body_enumerator.next
            end
            nodes
          end

          # Check for remaining nodes without advancing the enumerator
          # @return [Boolean]
          def body_has_more_nodes?
            @body_enumerator.peek
            true
          rescue StopIteration
            false
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns the nodes belonging to a quote into a quote element. Caption
        # and float handling come from the shared modules.
        class QuoteParser
          include Shared::Caption
          include Shared::Float

          # Empty nodes (as judged by `NodeAnalyzer#empty?`) are dropped
          # before anything else happens.
          # @param [Array[Nokogiri::HTML::Node]] nodes
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(nodes:, css_analyzer:)
            @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
            @css_analyzer = css_analyzer

            # First node of the quote indicates floating behavior
            @float_node = @nodes.first
            # Last node of the quote contains the caption
            @caption_node = @nodes.last
          end

          # Parse the quote's nodes to get a set of paragraphs
          # The last node is ignored as it contains the quote caption. A quote
          # without any nodes yields an empty list; `take(size - 1)` would
          # raise an `ArgumentError` for the resulting negative count.
          # @return [Array[ArticleJSON::Elements::Paragraph]]
          def content
            @nodes[0...-1].map do |node|
              ParagraphParser
                .new(node: node, css_analyzer: @css_analyzer)
                .element
            end
          end

          # @return [ArticleJSON::Elements::Quote]
          def element
            ArticleJSON::Elements::Quote.new(
              content: content,
              caption: caption,
              float: float
            )
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        module Shared
          # Mixin for parsers that carry a caption. The including class is
          # expected to set `@caption_node` and `@css_analyzer`.
          module Caption
            # Extract the text elements of the caption node
            # @return [Array[ArticleJSON::Elements::Text]]
            def caption
              text_parser = ArticleJSON::Import::GoogleDoc::HTML::TextParser
              text_parser.extract(
                node: @caption_node,
                css_analyzer: @css_analyzer
              )
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        module Shared
          # Mixin for parsers of floatable elements. The including class is
          # expected to set `@float_node` and `@css_analyzer`.
          module Float
            # Check if the element is floating (left, right or not at all)
            # The decision is based on the `class` attribute of the float
            # node. `nil` is returned when there is no float node (e.g. the
            # including parser was given no nodes), when the node has no
            # `class` attribute, or when the class is neither right nor left
            # aligned.
            # @return [Symbol]
            def float
              return unless @float_node&.has_attribute?('class')
              node_class = @float_node.attribute('class').value || ''
              return :right if @css_analyzer.right_aligned?(node_class)
              return :left if @css_analyzer.left_aligned?(node_class)
              nil
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns the nodes belonging to a text box into a text box element.
        # Float handling comes from the shared module.
        class TextBoxParser
          include Shared::Float

          # Empty nodes (as judged by `NodeAnalyzer#empty?`) are dropped
          # before anything else happens.
          # @param [Array[Nokogiri::HTML::Node]] nodes
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(nodes:, css_analyzer:)
            @nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
            @css_analyzer = css_analyzer

            # First node of the text box indicates floating behavior
            @float_node = @nodes.first
          end

          # Convert the text box's nodes into sub elements
          # Only headings, paragraphs and lists are supported; nodes of any
          # other type are left out of the result.
          # @return [Array]
          def content
            @nodes.each_with_object([]) do |node, sub_elements|
              parsed = parse_sub_node(node)
              sub_elements << parsed unless parsed.nil?
            end
          end

          # Build the text box element from float and content
          # @return [ArticleJSON::Elements::TextBox]
          def element
            ArticleJSON::Elements::TextBox.new(float: float, content: content)
          end

          private

          # Pick the parser matching the node's type and return its element;
          # `nil` for unsupported types.
          def parse_sub_node(node)
            sub_parser =
              case NodeAnalyzer.new(node).type
              when :heading
                HeadingParser.new(node: node)
              when :paragraph
                ParagraphParser.new(node: node, css_analyzer: @css_analyzer)
              when :list
                ListParser.new(node: node, css_analyzer: @css_analyzer)
              end
            sub_parser&.element
          end
        end
      end
    end
  end
end
|