article_json 0.3.8 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +53 -0
- data/README.md +127 -79
- data/bin/article_json_export_amp.rb +1 -0
- data/bin/{article_json_export_facebook.rb → article_json_export_apple_news.rb} +4 -4
- data/bin/article_json_export_html.rb +1 -0
- data/bin/article_json_export_plain_text.rb +1 -0
- data/bin/article_json_parse_google_doc.rb +1 -0
- data/bin/check_google_doc_export.rb +41 -0
- data/bin/update_oembed_request-stubs.sh +1 -3
- data/bin/update_reference_document.sh +3 -3
- data/lib/article_json/article.rb +17 -9
- data/lib/article_json/configuration.rb +6 -5
- data/lib/article_json/elements/base.rb +0 -1
- data/lib/article_json/elements/heading.rb +0 -1
- data/lib/article_json/elements/image.rb +0 -1
- data/lib/article_json/elements/list.rb +0 -1
- data/lib/article_json/elements/paragraph.rb +1 -1
- data/lib/article_json/elements/quote.rb +0 -1
- data/lib/article_json/elements/text.rb +1 -1
- data/lib/article_json/elements/text_box.rb +0 -1
- data/lib/article_json/export/amp/custom_element_library_resolver.rb +0 -1
- data/lib/article_json/export/amp/elements/embed.rb +43 -31
- data/lib/article_json/export/amp/elements/image.rb +7 -5
- data/lib/article_json/export/amp/exporter.rb +4 -2
- data/lib/article_json/export/apple_news/elements/base.rb +53 -0
- data/lib/article_json/export/apple_news/elements/embed.rb +130 -0
- data/lib/article_json/export/apple_news/elements/heading.rb +32 -0
- data/lib/article_json/export/apple_news/elements/image.rb +59 -0
- data/lib/article_json/export/apple_news/elements/list.rb +67 -0
- data/lib/article_json/export/apple_news/elements/paragraph.rb +36 -0
- data/lib/article_json/export/apple_news/elements/quote.rb +60 -0
- data/lib/article_json/export/apple_news/elements/text.rb +55 -0
- data/lib/article_json/export/apple_news/elements/text_box.rb +51 -0
- data/lib/article_json/export/apple_news/exporter.rb +37 -0
- data/lib/article_json/export/common/html/elements/embed.rb +2 -1
- data/lib/article_json/export/common/html/elements/image.rb +2 -1
- data/lib/article_json/export/common/html/elements/text.rb +2 -0
- data/lib/article_json/import/google_doc/html/embedded_parser.rb +1 -0
- data/lib/article_json/import/google_doc/html/heading_parser.rb +5 -5
- data/lib/article_json/import/google_doc/html/image_parser.rb +18 -2
- data/lib/article_json/import/google_doc/html/list_parser.rb +2 -2
- data/lib/article_json/import/google_doc/html/node_analyzer.rb +25 -3
- data/lib/article_json/import/google_doc/html/parser.rb +7 -1
- data/lib/article_json/import/google_doc/html/shared/caption.rb +1 -0
- data/lib/article_json/import/google_doc/html/shared/float.rb +2 -0
- data/lib/article_json/import/google_doc/html/text_box_parser.rb +2 -1
- data/lib/article_json/import/google_doc/html/text_parser.rb +2 -0
- data/lib/article_json/utils/additional_element_placer.rb +2 -0
- data/lib/article_json/utils/o_embed_resolver/base.rb +14 -4
- data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +17 -1
- data/lib/article_json/utils/o_embed_resolver/slideshare.rb +2 -2
- data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +14 -1
- data/lib/article_json/version.rb +1 -1
- data/lib/article_json.rb +11 -11
- metadata +29 -26
- data/lib/article_json/export/facebook_instant_article/elements/base.rb +0 -30
- data/lib/article_json/export/facebook_instant_article/elements/embed.rb +0 -44
- data/lib/article_json/export/facebook_instant_article/elements/heading.rb +0 -11
- data/lib/article_json/export/facebook_instant_article/elements/image.rb +0 -11
- data/lib/article_json/export/facebook_instant_article/elements/list.rb +0 -11
- data/lib/article_json/export/facebook_instant_article/elements/paragraph.rb +0 -11
- data/lib/article_json/export/facebook_instant_article/elements/quote.rb +0 -30
- data/lib/article_json/export/facebook_instant_article/elements/text.rb +0 -11
- data/lib/article_json/export/facebook_instant_article/elements/text_box.rb +0 -40
- data/lib/article_json/export/facebook_instant_article/exporter.rb +0 -17
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a paragraph element as an Apple News `body` component.
        class Paragraph < Base
          # Build the body component. Its text is the concatenation of the
          # exported child text elements, declared as HTML.
          # @return [Hash]
          def export
            body_html = text
            {
              role: 'body',
              text: body_html,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle',
            }
          end

          private

          # Look up the exporter class registered for `:text` elements
          # @return [Class] whatever `exporter_by_type(:text)` resolves to
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Export every child of the paragraph and glue the results together
          # @return [String]
          def text
            exported_children =
              @element.content.map do |child_element|
                exporter = text_exporter.new(child_element)
                exporter.export
              end
            exported_children.join
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a quote element as up to two Apple News components: the
        # pull quote itself and its attribution.
        class Quote < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # Components for the quote. A component is left out when the
          # element has no text for it (e.g. a quote without a caption), so
          # the result never contains `nil` entries.
          # @return [Array<Hash>]
          def export
            [quote, author].compact
          end

          private

          # Pull quote component
          # @return [Hash, nil] nil when the quote has no text
          def quote
            text = quote_text
            return if text.nil?

            {
              role: 'pullquote',
              text: text,
              format: 'html',
              layout: 'pullquoteLayout',
              textStyle: 'pullquoteStyle',
            }
          end

          # Attribution component
          # @return [Hash, nil] nil when the quote has no caption
          def author
            text = author_text
            return if text.nil?

            {
              role: 'author',
              text: text,
              format: 'html',
              layout: 'pullquoteAttributeLayout',
              textStyle: 'quoteAttributeStyle',
            }
          end

          # Exporter class registered for `:text` elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Exported text of the first text element of the first child.
          # NOTE(review): only that first text element is exported; any
          # further children are ignored -- confirm this is intended.
          # @return [String, nil]
          def quote_text
            export_text_element(@element.content&.first&.content&.first)
          end

          # Exported text of the first caption element
          # @return [String, nil]
          def author_text
            export_text_element(@element.caption&.first)
          end

          # Run a single text element through the text exporter. Returns nil
          # instead of instantiating the exporter with a missing element.
          # @param [Object, nil] text_element
          # @return [String, nil]
          def export_text_element(text_element)
            return if text_element.nil?

            text_exporter.new(text_element).export
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text element as an HTML string for Apple News.
        class Text < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # Tags that are stripped from the text before it is exported
          UNSUPPORTED_HTML_TAGS = %w[
            title meta script noscript style link applet object iframe
            noframes form select option optgroup
          ].freeze

          # Serialize the result of the inherited `export` to a string.
          # NOTE(review): `super` is defined outside this view (presumably the
          # common HTML text exporter returning a Nokogiri object) -- confirm.
          # @return [String]
          def export
            super.to_s
          end

          # Sanitize the text, turn newlines into `<br>` tags and parse the
          # result into nodes.
          # @param [String] text
          # @return [Nokogiri::XML::NodeSet]
          def create_text_nodes(text)
            html = sanitize_text(text).gsub(/\n/, '<br>')
            Nokogiri::HTML.fragment(html).children
          end

          # Removes every tag listed in UNSUPPORTED_HTML_TAGS from the text
          #
          # @param [String] text
          # @return [String]
          def sanitize_text(text)
            fragment = Nokogiri::HTML.fragment(text)
            UNSUPPORTED_HTML_TAGS.each do |tag_name|
              unsupported_nodes = fragment.search(tag_name)
              unsupported_nodes.each(&:remove)
            end
            fragment.inner_html
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text box element as an Apple News `container` component
        # that wraps the exported child elements.
        class TextBox < Base
          include ArticleJSON::Export::Common::HTML::Elements::TextBox

          # Text box container
          # @return [Hash]
          def export
            {
              role: 'container',
              layout: 'textBoxLayout',
              style: 'textBoxStyle',
              components: map_styles(elements),
            }
          end

          private

          # Export every child element with the exporter matching its type.
          # Anything that is not a heading, paragraph or list is exported as
          # text and wrapped in a body component.
          # @return [Array]
          def elements
            @element.content.map do |child_element|
              case child_element
              when ArticleJSON::Elements::Heading
                namespace::Heading.new(child_element).export
              when ArticleJSON::Elements::Paragraph
                namespace::Paragraph.new(child_element).export
              when ArticleJSON::Elements::List
                namespace::List.new(child_element).export
              else
                body_component(namespace::Text.new(child_element).export)
              end
            end
          end

          # Wrap exported text in a body component. The text exporter returns
          # a bare String, which cannot be restyled by `map_styles` and is not
          # a component on its own.
          # @param [String] text
          # @return [Hash]
          def body_component(text)
            {
              role: 'body',
              text: text,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle',
            }
          end

          # @return [Module]
          def namespace
            ArticleJSON::Export::AppleNews::Elements
          end

          # Prefix the layout name of every component with `textBox`, e.g.
          # `bodyLayout` becomes `textBoxBodyLayout`. Entries that are not
          # hashes, or that carry no layout name, are passed through unchanged
          # instead of raising.
          # @param [Array] elements
          # @return [Array]
          def map_styles(elements)
            elements.map do |component|
              layout = component[:layout] if component.is_a?(Hash)
              next component unless layout.is_a?(String)

              component.merge(
                layout: "textBox#{layout.sub(/\S/, &:upcase)}"
              )
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      # Exports a list of article elements as the `components` section of an
      # Apple News article.
      class Exporter
        # @param [Array[ArticleJSON::Elements::Base]] elements
        def initialize(elements)
          @elements = elements
        end

        # Return the components section of an Apple News Article as JSON
        #
        # Accepts (and forwards) the optional generator arguments, so an
        # exporter can also be serialized as part of a larger structure,
        # where the JSON generator calls `to_json(state)`.
        #
        # @return [String]
        def to_json(*args)
          { components: components }.to_json(*args)
        end

        private

        # Generate a flat array with the exported component of every element
        #
        # Element exporters may return an array of components instead of a
        # single one (the original comment names images and embedded videos
        # with captions). Those arrays are flattened into the list, and
        # missing or empty components are dropped afterwards so that entries
        # coming from nested arrays are filtered as well.
        #
        # @return [Array]
        def components
          @components ||=
            @elements
              .map do |element|
                ArticleJSON::Export::AppleNews::Elements::Base
                  .build(element)
                  &.export
              end
              .flatten
              .reject { |component| component.nil? || component.empty? }
        end
      end
    end
  end
end
|
|
@@ -20,7 +20,7 @@ module ArticleJSON
|
|
|
20
20
|
private
|
|
21
21
|
|
|
22
22
|
def embed_node
|
|
23
|
-
type = @element.embed_type.to_s.tr('_','-')
|
|
23
|
+
type = @element.embed_type.to_s.tr('_', '-')
|
|
24
24
|
create_element(:div, class: "embed #{type}") do |div|
|
|
25
25
|
div.add_child(embedded_object)
|
|
26
26
|
end
|
|
@@ -28,6 +28,7 @@ module ArticleJSON
|
|
|
28
28
|
|
|
29
29
|
def embedded_object
|
|
30
30
|
return unavailable_node unless @element.oembed_data
|
|
31
|
+
|
|
31
32
|
Nokogiri::HTML.fragment(@element.oembed_data[:html])
|
|
32
33
|
end
|
|
33
34
|
|
|
@@ -19,7 +19,7 @@ module ArticleJSON
|
|
|
19
19
|
# @return [Nokogiri::XML::NodeSet]
|
|
20
20
|
def figure_node
|
|
21
21
|
create_element(:figure, node_opts) do |figure|
|
|
22
|
-
node =
|
|
22
|
+
node = @element&.href ? href_node : image_node
|
|
23
23
|
figure.add_child(node)
|
|
24
24
|
if @element.caption&.any?
|
|
25
25
|
figure.add_child(caption_node(:figcaption))
|
|
@@ -42,6 +42,7 @@ module ArticleJSON
|
|
|
42
42
|
# @return [Hash]
|
|
43
43
|
def node_opts
|
|
44
44
|
return if floating_class.nil?
|
|
45
|
+
|
|
45
46
|
{ class: floating_class }
|
|
46
47
|
end
|
|
47
48
|
end
|
|
@@ -11,6 +11,7 @@ module ArticleJSON
|
|
|
11
11
|
return bold_and_italic_node if @element.bold && @element.italic
|
|
12
12
|
return bold_node if @element.bold
|
|
13
13
|
return italic_node if @element.italic
|
|
14
|
+
|
|
14
15
|
content_node
|
|
15
16
|
end
|
|
16
17
|
|
|
@@ -38,6 +39,7 @@ module ArticleJSON
|
|
|
38
39
|
# @return [Nokogiri::XML::NodeSet]
|
|
39
40
|
def content_node
|
|
40
41
|
return create_text_nodes(@element.content) if @element.href.nil?
|
|
42
|
+
|
|
41
43
|
create_element(:a, href: @element.href) do |a|
|
|
42
44
|
a.add_child(create_text_nodes(@element.content))
|
|
43
45
|
end
|
|
@@ -19,11 +19,11 @@ module ArticleJSON
|
|
|
19
19
|
# @return [Integer]
|
|
20
20
|
def level
|
|
21
21
|
case @node.name
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
22
|
+
when 'h1' then 1
|
|
23
|
+
when 'h2' then 2
|
|
24
|
+
when 'h3' then 3
|
|
25
|
+
when 'h4' then 4
|
|
26
|
+
when 'h5' then 5
|
|
27
27
|
end
|
|
28
28
|
end
|
|
29
29
|
|
|
@@ -22,25 +22,31 @@ module ArticleJSON
|
|
|
22
22
|
# The value of the image's `alt` attribute
|
|
23
23
|
# @return [String]
|
|
24
24
|
def alt
|
|
25
|
+
return '' if image_url?
|
|
26
|
+
|
|
25
27
|
image_node.attribute('alt')&.value || ''
|
|
26
28
|
end
|
|
27
29
|
|
|
28
30
|
# The value of the image's `src` attribute
|
|
29
31
|
# @return [String]
|
|
30
32
|
def source_url
|
|
33
|
+
return @node.inner_text.strip if image_url?
|
|
34
|
+
|
|
31
35
|
image_node.attribute('src').value
|
|
32
36
|
end
|
|
33
37
|
|
|
34
38
|
# The node of the actual image
|
|
35
39
|
# @return [Nokogiri::HTML::Node]
|
|
36
40
|
def image_node
|
|
37
|
-
@
|
|
41
|
+
return @image_node if defined? @image_node
|
|
42
|
+
|
|
43
|
+
@image_node = @node.xpath('.//img').first
|
|
38
44
|
end
|
|
39
45
|
|
|
40
46
|
# Check if the image is floating (left, right or not at all)
|
|
41
47
|
# @return [Symbol]
|
|
42
48
|
def float
|
|
43
|
-
super if floatable_size?
|
|
49
|
+
super if image_url? || floatable_size?
|
|
44
50
|
end
|
|
45
51
|
|
|
46
52
|
# Extracts an href from the tag [image-link-to: url]) if present
|
|
@@ -48,8 +54,10 @@ module ArticleJSON
|
|
|
48
54
|
# @return [String]
|
|
49
55
|
def href
|
|
50
56
|
return if @caption_node.nil?
|
|
57
|
+
|
|
51
58
|
match = @caption_node.content.strip.match(href_regexp)
|
|
52
59
|
return if match.nil?
|
|
60
|
+
|
|
53
61
|
remove_image_link_tag
|
|
54
62
|
match[:url]
|
|
55
63
|
end
|
|
@@ -80,6 +88,7 @@ module ArticleJSON
|
|
|
80
88
|
def href_regexp
|
|
81
89
|
%r{\[image-link-to:\s+(?<url>.*?)\]}
|
|
82
90
|
end
|
|
91
|
+
|
|
83
92
|
# Check if the image's width can be determined and is less than 500px
|
|
84
93
|
# This is about 3/4 of the google document width...
|
|
85
94
|
# @return [Boolean]
|
|
@@ -101,6 +110,13 @@ module ArticleJSON
|
|
|
101
110
|
match['px'].to_i if match && match['px']
|
|
102
111
|
end
|
|
103
112
|
end
|
|
113
|
+
|
|
114
|
+
# When the current node doesn't contain an actual image tag,
|
|
115
|
+
# we're dealing with an image URL
|
|
116
|
+
# @return [Boolean]
|
|
117
|
+
def image_url?
|
|
118
|
+
image_node.nil?
|
|
119
|
+
end
|
|
104
120
|
end
|
|
105
121
|
end
|
|
106
122
|
end
|
|
@@ -31,6 +31,7 @@ module ArticleJSON
|
|
|
31
31
|
# @return [Boolean]
|
|
32
32
|
def empty?
|
|
33
33
|
return @is_empty if defined? @is_empty
|
|
34
|
+
|
|
34
35
|
@is_empty = node.inner_text.strip.empty? && !image? && !hr? && !br?
|
|
35
36
|
end
|
|
36
37
|
|
|
@@ -38,6 +39,7 @@ module ArticleJSON
|
|
|
38
39
|
# @return [Boolean]
|
|
39
40
|
def heading?
|
|
40
41
|
return @is_heading if defined? @is_heading
|
|
42
|
+
|
|
41
43
|
@is_heading =
|
|
42
44
|
!quote? && !text_box? && %w(h1 h2 h3 h4 h5).include?(node.name)
|
|
43
45
|
end
|
|
@@ -52,6 +54,7 @@ module ArticleJSON
|
|
|
52
54
|
# @return [Boolean]
|
|
53
55
|
def paragraph?
|
|
54
56
|
return @is_paragraph if defined? @is_paragraph
|
|
57
|
+
|
|
55
58
|
@is_paragraph =
|
|
56
59
|
node.name == 'p' &&
|
|
57
60
|
!empty? &&
|
|
@@ -65,7 +68,8 @@ module ArticleJSON
|
|
|
65
68
|
# @return [Boolean]
|
|
66
69
|
def list?
|
|
67
70
|
return @is_list if defined? @is_list
|
|
68
|
-
|
|
71
|
+
|
|
72
|
+
@is_list = %w[ul ol].include?(node.name)
|
|
69
73
|
end
|
|
70
74
|
|
|
71
75
|
# Check if the node starts a text box
|
|
@@ -73,6 +77,7 @@ module ArticleJSON
|
|
|
73
77
|
# @return [Boolean]
|
|
74
78
|
def text_box?
|
|
75
79
|
return @is_text_box if defined? @is_text_box
|
|
80
|
+
|
|
76
81
|
@is_text_box = begins_with?('textbox:') || begins_with?('highlight:')
|
|
77
82
|
end
|
|
78
83
|
|
|
@@ -81,6 +86,7 @@ module ArticleJSON
|
|
|
81
86
|
# @return [Boolean]
|
|
82
87
|
def quote?
|
|
83
88
|
return @is_quote if defined? @is_quote
|
|
89
|
+
|
|
84
90
|
@is_quote = has_text?('quote:')
|
|
85
91
|
end
|
|
86
92
|
|
|
@@ -88,13 +94,25 @@ module ArticleJSON
|
|
|
88
94
|
# @return [Boolean]
|
|
89
95
|
def image?
|
|
90
96
|
return @is_image if defined? @is_image
|
|
91
|
-
|
|
97
|
+
|
|
98
|
+
@is_image = image_url? || node.xpath('.//img').length > 0
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
# Check if the node contains an image URL
|
|
102
|
+
# @return [Boolean]
|
|
103
|
+
def image_url?
  # The result is computed once per analyzer and cached, including `false`.
  unless defined? @is_image_url
    stripped_text = node.inner_text.strip
    image_url_pattern = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
    # True when the node's text contains an http(s) URL with one of the
    # listed image file extensions (case-insensitive).
    @is_image_url = !image_url_pattern.match(stripped_text).nil?
  end
  @is_image_url
end
|
|
93
110
|
|
|
94
111
|
# Check if the node contains an embedded element
|
|
95
112
|
# @return [Boolean]
|
|
96
113
|
def embed?
|
|
97
114
|
return @is_embed if defined? @is_embed
|
|
115
|
+
|
|
98
116
|
@is_embed = EmbeddedParser.supported?(node)
|
|
99
117
|
end
|
|
100
118
|
|
|
@@ -103,6 +121,7 @@ module ArticleJSON
|
|
|
103
121
|
# @return [Boolean]
|
|
104
122
|
def br?
|
|
105
123
|
return @is_br if defined? @is_br
|
|
124
|
+
|
|
106
125
|
@is_br = node.name == 'br' || only_includes_brs?
|
|
107
126
|
end
|
|
108
127
|
|
|
@@ -119,6 +138,7 @@ module ArticleJSON
|
|
|
119
138
|
return :quote if quote?
|
|
120
139
|
return :image if image?
|
|
121
140
|
return :embed if embed?
|
|
141
|
+
|
|
122
142
|
:unknown
|
|
123
143
|
end
|
|
124
144
|
|
|
@@ -128,9 +148,11 @@ module ArticleJSON
|
|
|
128
148
|
# @return [Boolean]
|
|
129
149
|
def only_includes_brs?
|
|
130
150
|
return false unless node.inner_text.strip.empty?
|
|
151
|
+
|
|
131
152
|
tags = node.children.map(&:name)
|
|
132
153
|
# Check if it only contains <br> and text nodes
|
|
133
|
-
return false unless tags.all? { |tag| %w
|
|
154
|
+
return false unless tags.all? { |tag| %w[br text].include? tag }
|
|
155
|
+
|
|
134
156
|
# Check if at least one is a `<br>` node
|
|
135
157
|
tags.include?('br')
|
|
136
158
|
end
|
|
@@ -6,7 +6,12 @@ module ArticleJSON
|
|
|
6
6
|
# @param [String] html
|
|
7
7
|
def initialize(html)
|
|
8
8
|
doc = Nokogiri::HTML(html)
|
|
9
|
-
|
|
9
|
+
selection = if doc.xpath('//body/div').empty?
|
|
10
|
+
doc.xpath('//body')
|
|
11
|
+
else
|
|
12
|
+
doc.xpath('//body/div')
|
|
13
|
+
end
|
|
14
|
+
@body_enumerator = selection.last.children.to_enum
|
|
10
15
|
|
|
11
16
|
css_node = doc.xpath('//head/style').last
|
|
12
17
|
@css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
|
|
@@ -113,6 +118,7 @@ module ArticleJSON
|
|
|
113
118
|
nodes = []
|
|
114
119
|
until !body_has_more_nodes? ||
|
|
115
120
|
NodeAnalyzer.new(@body_enumerator.peek).hr?
|
|
121
|
+
|
|
116
122
|
nodes << @body_enumerator.next
|
|
117
123
|
end
|
|
118
124
|
nodes
|
|
@@ -8,9 +8,11 @@ module ArticleJSON
|
|
|
8
8
|
# @return [Symbol]
|
|
9
9
|
def float
|
|
10
10
|
return unless @float_node.has_attribute?('class')
|
|
11
|
+
|
|
11
12
|
node_class = @float_node.attribute('class').value || ''
|
|
12
13
|
return :right if @css_analyzer.right_aligned?(node_class)
|
|
13
14
|
return :left if @css_analyzer.left_aligned?(node_class)
|
|
15
|
+
|
|
14
16
|
nil
|
|
15
17
|
end
|
|
16
18
|
end
|
|
@@ -10,7 +10,7 @@ module ArticleJSON
|
|
|
10
10
|
# May contain tags, too.
|
|
11
11
|
# @param [Array[Nokogiri::HTML::Node]] nodes
|
|
12
12
|
# @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
|
|
13
|
-
def initialize(type_node
|
|
13
|
+
def initialize(type_node:, nodes:, css_analyzer:)
|
|
14
14
|
@nodes = nodes.reject { |node| NodeAnalyzer.new(node).empty? }
|
|
15
15
|
@css_analyzer = css_analyzer
|
|
16
16
|
|
|
@@ -32,6 +32,7 @@ module ArticleJSON
|
|
|
32
32
|
match = /(.*?)[\s\u00A0]+\[(?<tags>.*)\]/
|
|
33
33
|
.match(@type_node.inner_text)
|
|
34
34
|
return [] unless match
|
|
35
|
+
|
|
35
36
|
match[:tags].split(' ')
|
|
36
37
|
end
|
|
37
38
|
|
|
@@ -43,6 +43,7 @@ module ArticleJSON
|
|
|
43
43
|
if @node.name == 'span' &&
|
|
44
44
|
@node.first_element_child&.name == 'a' &&
|
|
45
45
|
@node.first_element_child&.has_attribute?('href')
|
|
46
|
+
|
|
46
47
|
strip_google_redirect(
|
|
47
48
|
@node.first_element_child.attribute('href').value
|
|
48
49
|
)
|
|
@@ -68,6 +69,7 @@ module ArticleJSON
|
|
|
68
69
|
def extract(node:, css_analyzer:)
|
|
69
70
|
node.children.map do |child_node|
|
|
70
71
|
next if NodeAnalyzer.new(child_node).empty?
|
|
72
|
+
|
|
71
73
|
new(node: child_node, css_analyzer: css_analyzer).element
|
|
72
74
|
end.compact
|
|
73
75
|
end
|
|
@@ -40,6 +40,7 @@ module ArticleJSON
|
|
|
40
40
|
# @return [Array[ArticleJSON::Elements::Base|Object]]
|
|
41
41
|
def merge_elements
|
|
42
42
|
return @additional_elements if @elements.nil? || @elements.empty?
|
|
43
|
+
|
|
43
44
|
remaining_elements = @additional_elements.dup
|
|
44
45
|
next_in = insert_next_element_in(0, remaining_elements)
|
|
45
46
|
characters_passed = 0
|
|
@@ -48,6 +49,7 @@ module ArticleJSON
|
|
|
48
49
|
.each_with_object([]) do |(element, next_element), result|
|
|
49
50
|
result << element
|
|
50
51
|
next if remaining_elements.empty?
|
|
52
|
+
|
|
51
53
|
if element.respond_to?(:length)
|
|
52
54
|
characters_passed += element.length
|
|
53
55
|
next_in -= element.length
|
|
@@ -23,8 +23,10 @@ module ArticleJSON
|
|
|
23
23
|
def unavailable_message
|
|
24
24
|
[
|
|
25
25
|
ArticleJSON::Elements::Text.new(content: "The #{name} "),
|
|
26
|
-
ArticleJSON::Elements::Text.new(
|
|
27
|
-
|
|
26
|
+
ArticleJSON::Elements::Text.new(
|
|
27
|
+
content: source_url,
|
|
28
|
+
href: source_url
|
|
29
|
+
),
|
|
28
30
|
ArticleJSON::Elements::Text.new(content: ' is not available.'),
|
|
29
31
|
]
|
|
30
32
|
end
|
|
@@ -44,19 +46,27 @@ module ArticleJSON
|
|
|
44
46
|
# @return [Hash|nil]
|
|
45
47
|
def parsed_api_response
|
|
46
48
|
return @api_response if defined? @api_response
|
|
49
|
+
|
|
47
50
|
@api_response = begin
|
|
48
51
|
uri = URI.parse(oembed_url)
|
|
49
52
|
http = Net::HTTP.new(uri.host, uri.port)
|
|
50
53
|
http.use_ssl = (uri.scheme == 'https')
|
|
51
54
|
response = http.request(Net::HTTP::Get.new(uri, http_headers))
|
|
52
|
-
|
|
53
|
-
|
|
55
|
+
|
|
56
|
+
if response.is_a? Net::HTTPSuccess
|
|
57
|
+
data = JSON.parse(response.body, symbolize_names: true)
|
|
58
|
+
transform_api_response(data)
|
|
54
59
|
end
|
|
55
60
|
rescue Net::ProtocolError, JSON::ParserError
|
|
56
61
|
nil
|
|
57
62
|
end
|
|
58
63
|
end
|
|
59
64
|
|
|
65
|
+
# @return [Hash]
|
|
66
|
+
def transform_api_response(data)
|
|
67
|
+
data
|
|
68
|
+
end
|
|
69
|
+
|
|
60
70
|
# @return [Hash]
|
|
61
71
|
def http_headers
|
|
62
72
|
headers = { 'Content-Type' => 'application/json' }
|
|
@@ -11,7 +11,8 @@ module ArticleJSON
|
|
|
11
11
|
# The URL for the oembed API call
|
|
12
12
|
# @return [String]
|
|
13
13
|
def oembed_url
|
|
14
|
-
"https://
|
|
14
|
+
"https://graph.facebook.com/v9.0/oembed_video?url=#{source_url}" \
|
|
15
|
+
"&access_token=#{access_token}"
|
|
15
16
|
end
|
|
16
17
|
|
|
17
18
|
# The video URL of the element
|
|
@@ -19,6 +20,21 @@ module ArticleJSON
|
|
|
19
20
|
def source_url
|
|
20
21
|
"https://www.facebook.com/facebook/videos/#{@element.embed_id}"
|
|
21
22
|
end
|
|
23
|
+
|
|
24
|
+
# The facebook access token. If not set, it raises an exception
|
|
25
|
+
# explaining how to configure it.
|
|
26
|
+
#
|
|
27
|
+
# @return [String]
|
|
28
|
+
def access_token
  token = ArticleJSON.configuration.facebook_token
  return token unless token.nil?

  # Adjacent string literals are joined verbatim, so every fragment has to
  # carry its own separating space to keep the message readable.
  raise 'You need to configure the facebook token to use facebook ' \
        'embed videos, see: ' \
        'https://github.com/Devex/article_json#facebook-oembed'
end
|
|
22
38
|
end
|
|
23
39
|
end
|
|
24
40
|
end
|