article_json 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +31 -0
- data/README.md +118 -67
- data/bin/article_json_export_amp.rb +15 -0
- data/bin/article_json_export_apple_news.rb +15 -0
- data/bin/article_json_export_facebook.rb +16 -0
- data/bin/article_json_export_html.rb +1 -0
- data/bin/article_json_export_plain_text.rb +15 -0
- data/bin/article_json_parse_google_doc.rb +1 -0
- data/bin/check_google_doc_export.rb +41 -0
- data/bin/update_oembed_request-stubs.sh +11 -0
- data/bin/update_reference_document.sh +16 -0
- data/lib/article_json/article.rb +22 -2
- data/lib/article_json/configuration.rb +2 -1
- data/lib/article_json/elements/base.rb +0 -1
- data/lib/article_json/elements/image.rb +7 -3
- data/lib/article_json/export/amp/elements/embed.rb +1 -1
- data/lib/article_json/export/apple_news/elements/base.rb +53 -0
- data/lib/article_json/export/apple_news/elements/embed.rb +130 -0
- data/lib/article_json/export/apple_news/elements/heading.rb +32 -0
- data/lib/article_json/export/apple_news/elements/image.rb +58 -0
- data/lib/article_json/export/apple_news/elements/list.rb +67 -0
- data/lib/article_json/export/apple_news/elements/paragraph.rb +36 -0
- data/lib/article_json/export/apple_news/elements/quote.rb +60 -0
- data/lib/article_json/export/apple_news/elements/text.rb +42 -0
- data/lib/article_json/export/apple_news/elements/text_box.rb +51 -0
- data/lib/article_json/export/apple_news/exporter.rb +37 -0
- data/lib/article_json/export/common/html/elements/image.rb +1 -1
- data/lib/article_json/import/google_doc/html/image_parser.rb +24 -3
- data/lib/article_json/import/google_doc/html/node_analyzer.rb +11 -1
- data/lib/article_json/import/google_doc/html/parser.rb +6 -1
- data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +17 -1
- data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +1 -1
- data/lib/article_json/version.rb +1 -1
- data/lib/article_json.rb +11 -0
- metadata +37 -15
@@ -0,0 +1,53 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Root of the Apple News element exporters. Mixes in the shared
        # exporter behaviour from `ArticleJSON::Export::Common::Elements::Base`
        # and declares which exporter class handles which element type.
        class Base
          include ArticleJSON::Export::Common::Elements::Base

          # Export the wrapped element by delegating to the mixed-in
          # `#export`. A falsy result (e.g. when no exporter is specified for
          # the element's type) is normalised to `nil`.
          # @return [Object, nil] whatever the type-specific exporter produced
          def export
            super || nil
          end

          # The module namespace this class and its subclasses are nested
          # within.
          # @return [Module]
          def self.namespace
            ArticleJSON::Export::AppleNews::Elements
          end

          class << self
            private

            # Identifier of the format produced by this exporter family. It is
            # used to select the custom element exporters from the
            # configuration.
            # @return [Symbol]
            def export_format
              :apple_news
            end

            # Element type => exporter class used when no custom exporter is
            # configured.
            # @return [Hash{Symbol => Class}]
            def default_exporter_mapping
              {
                text: namespace::Text,
                paragraph: namespace::Paragraph,
                heading: namespace::Heading,
                quote: namespace::Quote,
                list: namespace::List,
                image: namespace::Image,
                embed: namespace::Embed,
                text_box: namespace::TextBox
              }
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,130 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports an embed element (video, tweet, ...) as an Apple News
        # component, optionally followed by a separate caption component.
        # Embed types without a known role produce an empty Hash.
        class Embed < Base
          # Embed | Embed, Caption
          # @return [Hash, Array<Hash>] the embed component alone, or the
          #   embed component followed by its caption component
          def export
            caption_text.nil? ? embed : [embed, caption]
          end

          private

          # Embed component. Keys whose value is nil are dropped, so an
          # unsupported embed type results in an empty Hash.
          # @return [Hash]
          def embed
            {
              role: role,
              URL: source_url,
              caption: caption_text
            }.compact
          end

          # Caption component
          # @return [Hash]
          def caption
            {
              role: 'caption',
              text: caption_text,
              format: 'html',
              layout: 'captionLayout',
              textStyle: 'captionStyle'
            }
          end

          # Exporter class used for the caption's text elements, as resolved
          # by `exporter_by_type(:text)`.
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Caption text, or nil when there is nothing to show.
          # @return [String, nil]
          def caption_text
            # Do not show captions for unsupported components
            return nil if role.nil?

            rendered = text
            rendered.empty? ? nil : rendered
          end

          # Concatenated export of all caption child elements.
          # @return [String]
          def text
            @element.caption.map do |child_element|
              text_exporter.new(child_element).export
            end.join
          end

          # Apple News role for the embed type; nil for every type that has
          # no mapping here (e.g. :slideshare, :soundcloud).
          # @return [Symbol, nil]
          def role
            @role ||=
              case embed_type
              when :youtube_video, :vimeo_video, :dailymotion_video
                :embedwebvideo
              when :facebook_video
                :facebook_post
              when :tweet
                :tweet
              end
          end

          # URL of the embedded resource; nil for unsupported embed types.
          # @return [String, nil]
          def source_url
            case embed_type
            when :youtube_video then build_embeded_youtube_url
            when :vimeo_video then build_embeded_vimeo_url
            # Previously this branch built a Vimeo player URL by mistake.
            when :dailymotion_video then build_embeded_dailymotion_url
            when :facebook_video then build_facebook_video_url
            when :tweet then build_twitter_url
            end
          end

          # @return [String]
          def build_embeded_youtube_url
            "https://www.youtube.com/embed/#{embed_id}"
          end

          # @return [String]
          def build_embeded_vimeo_url
            "https://player.vimeo.com/video/#{embed_id}"
          end

          # @return [String]
          def build_embeded_dailymotion_url
            "https://geo.dailymotion.com/player.html?video=#{embed_id}"
          end

          # The embed id is split at the first "/" into "<username>/<id>".
          # @return [String]
          def build_facebook_video_url
            username, id = embed_id.split('/', 2)
            "https://www.facebook.com/#{username}/videos/#{id}"
          end

          # The embed id is split at the first "/" into "<username>/<id>".
          # @return [String]
          def build_twitter_url
            username, id = embed_id.split('/', 2)
            "https://twitter.com/#{username}/status/#{id}"
          end

          # @return [Symbol]
          def embed_type
            @embed_type ||= @element.embed_type.to_sym
          end

          # The id is only ever interpolated into URLs or split as a String,
          # so it is kept as a String instead of being interned as a Symbol
          # (which would also fail for non-String ids).
          # @return [String]
          def embed_id
            @embed_id ||= @element.embed_id.to_s
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a heading element as an Apple News heading component.
        class Heading < Base
          # Headline
          # @return [Hash]
          def export
            {
              role: role,
              text: @element.content,
              layout: 'titleLayout',
              textStyle: 'defaultTitle'
            }
          end

          private

          # The role of the text component for adding a heading. (Required)
          # Always one of these roles for this component: heading, heading1,
          # heading2, heading3, heading4, heading5, or heading6.
          # A nil level interpolates to an empty string, which yields the
          # plain "heading" role.
          # @return [String]
          def role
            "heading#{@element.level}"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports an image element as an Apple News image component,
        # optionally followed by a separate caption component.
        class Image < Base
          # Image | Image, Caption
          # @return [Hash, Array<Hash>] the image component alone when there
          #   is no caption text, otherwise the image followed by its caption
          def export
            return image if caption_text.nil?

            [image, caption]
          end

          private

          # Image component; entries whose value is nil are left out.
          # @return [Hash]
          def image
            component = {
              role: 'image',
              URL: @element.source_url,
              caption: caption_text
            }
            component.reject { |_key, value| value.nil? }
          end

          # Caption component
          # @return [Hash]
          def caption
            {
              role: 'caption',
              text: caption_text,
              format: 'html',
              layout: 'captionLayout',
              textStyle: 'captionStyle'
            }
          end

          # Exporter class used for the caption's text elements, as resolved
          # by `exporter_by_type(:text)`.
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Caption text, or nil when the exported caption is empty.
          # @return [String, nil]
          def caption_text
            rendered = text
            rendered.empty? ? nil : rendered
          end

          # Concatenated export of all caption child elements.
          # @return [String]
          def text
            exported = @element.caption.map do |child_element|
              text_exporter.new(child_element).export
            end
            exported.join
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a list element as an Apple News body component whose text
        # is an HTML list.
        class List < Base
          # List
          # @return [Hash]
          def export
            {
              role: 'body',
              text: list_text,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle'
            }
          end

          private

          # Exporter class used for the text elements inside the list items,
          # as resolved by `exporter_by_type(:text)`.
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # The list items wrapped in <ol></ol> for an ordered list and in
          # <ul></ul> otherwise.
          # @return [String]
          def list_text
            [prepend_list_tag, list, append_list_tag].join
          end

          # All list items, each wrapped in <li></li>.
          # @return [String]
          def list
            items = @element.content.map do |paragraph_element|
              rendered = paragraph_element.content.map do |text_element|
                text_exporter.new(text_element).export
              end
              "<li>#{rendered.join}</li>"
            end
            items.join
          end

          # Opening list tag
          # @return [String]
          def prepend_list_tag
            if ordered_list?
              '<ol>'
            else
              '<ul>'
            end
          end

          # Closing list tag
          # @return [String]
          def append_list_tag
            if ordered_list?
              '</ol>'
            else
              '</ul>'
            end
          end

          # @return [Boolean]
          def ordered_list?
            @element.list_type == :ordered
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a paragraph element as an Apple News body component.
        class Paragraph < Base
          # Build the body component from the paragraph's text elements.
          # @return [Hash]
          def export
            {
              role: 'body',
              text: text,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle'
            }
          end

          private

          # Exporter class used for the paragraph's text elements, as
          # resolved by `exporter_by_type(:text)`.
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Concatenated export of all child elements of the paragraph.
          # @return [String]
          def text
            exported = @element.content.map do |child_element|
              text_exporter.new(child_element).export
            end
            exported.join
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a quote element as an Apple News pullquote component
        # followed by an author component.
        class Quote < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # Quote, Author. A component is left out when the element has no
          # text for it (empty quote content or empty caption), instead of
          # handing nil to the text exporter.
          # @return [Array<Hash>]
          def export
            [quote, author].compact
          end

          private

          # Quote component, or nil when the quote has no text element.
          # @return [Hash, nil]
          def quote
            text = quote_text
            return nil if text.nil?

            {
              role: 'pullquote',
              text: text,
              format: 'html',
              layout: 'pullquoteLayout',
              textStyle: 'pullquoteStyle'
            }
          end

          # Author component, or nil when the quote has no caption element.
          # @return [Hash, nil]
          def author
            text = author_text
            return nil if text.nil?

            {
              role: 'author',
              text: text,
              format: 'html',
              layout: 'pullquoteAttributeLayout',
              textStyle: 'quoteAttributeStyle'
            }
          end

          # Exporter class used for text elements, as resolved by
          # `exporter_by_type(:text)`.
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Quote Text
          # NOTE(review): only the first text element of the first content
          # element is exported; any further elements are ignored. Confirm
          # this is intended.
          # @return [String, nil]
          def quote_text
            export_text(@element.content&.first&.content&.first)
          end

          # Author Text
          # NOTE(review): only the first caption element is exported.
          # @return [String, nil]
          def author_text
            export_text(@element.caption&.first)
          end

          # Export a single text element; nil is passed through untouched.
          # @param [Object, nil] text_element
          # @return [String, nil]
          def export_text(text_element)
            return nil if text_element.nil?

            text_exporter.new(text_element).export
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text element as a String of plain text or HTML.
        class Text < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # Tags that are stripped from the text before it is exported.
          UNSUPPORTED_HTML_TAGS = %w[
            title meta script noscript style link applet object iframe
            noframes form select option optgroup
          ].freeze

          # `super` returns a Nokogiri object, which is then converted to a
          # String (plain text, or HTML when the text is formatted). Both are
          # compatible with the Apple News format. Takes into account bold,
          # italic and href.
          # @return [String]
          def export
            super.to_s
          end

          # Build the Nokogiri nodes for the given text: unsupported tags are
          # removed and every newline becomes a `<br>`.
          # @param [String] text
          # @return [Object] the children of the parsed HTML fragment
          def create_text_nodes(text)
            html = sanitize_text(text).gsub(/\n/, '<br>')
            Nokogiri::HTML.fragment(html).children
          end

          # Removes all UNSUPPORTED_HTML_TAGS from the text.
          #
          # @param [String] text
          # @return [String]
          def sanitize_text(text)
            fragment = Nokogiri::HTML.fragment(text)
            UNSUPPORTED_HTML_TAGS.each do |tag|
              fragment.search(tag).each { |unsupported_node| unsupported_node.remove }
            end
            fragment.inner_html
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text box element as an Apple News container component
        # that wraps the exported child elements.
        class TextBox < Base
          include ArticleJSON::Export::Common::HTML::Elements::TextBox

          # Text box
          # @return [Hash]
          def export
            {
              role: 'container',
              layout: 'textBoxLayout',
              style: 'textBoxStyle',
              components: map_styles(elements)
            }
          end

          private

          # Exported child elements. Headings, paragraphs and lists use their
          # dedicated exporter; everything else goes through the text
          # exporter.
          # NOTE(review): `map_styles` calls `merge` and `[:layout]` on every
          # entry, so it expects Hashes here; confirm that the fallback text
          # exporter never yields a plain String for a text box child.
          # @return [Array]
          def elements
            @element.content.map do |child_element|
              exporter_class =
                case child_element
                when ArticleJSON::Elements::Heading then namespace::Heading
                when ArticleJSON::Elements::Paragraph then namespace::Paragraph
                when ArticleJSON::Elements::List then namespace::List
                else namespace::Text
                end
              exporter_class.new(child_element).export
            end
          end

          # @return [Module]
          def namespace
            ArticleJSON::Export::AppleNews::Elements
          end

          # Replace the layout of every component with its text box variant:
          # the first non-whitespace character of the layout name is
          # upcased and the result is prefixed with "textBox"
          # (e.g. "bodyLayout" becomes "textBoxBodyLayout").
          # @param [Array<Hash>] elements
          # @return [Array]
          def map_styles(elements)
            elements.map do |component|
              capitalized_layout = component[:layout].sub(/\S/, &:upcase)
              component.merge(layout: 'textBox' + capitalized_layout)
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module ArticleJSON
  module Export
    module AppleNews
      # Exports a list of article elements as the `components` section of an
      # Apple News article.
      class Exporter
        # @param [Array[ArticleJSON::Elements::Base]] elements
        def initialize(elements)
          @elements = elements
        end

        # Return the components section of an Apple News Article as JSON
        #
        # Images and embeds are exported as a nested array within the
        # components array when they contain captions. As Apple News skips
        # over these nested arrays, we must flatten the array.
        #
        # Any arguments are forwarded to `Hash#to_json`, so the exporter can
        # also be serialised as part of a larger structure (the JSON
        # generator passes its state object to `#to_json`).
        #
        # @return [String]
        def to_json(*args)
          { components: components.flatten }.to_json(*args)
        end

        private

        # Generate an array with the Apple News component of every element.
        # Elements without an exporter (nil) and empty exports are dropped.
        #
        # @return [Array]
        def components
          @components ||=
            @elements
            .map { |element| element_exporter.build(element)&.export }
            .reject { |component| component.nil? || component.empty? }
        end

        # Base class used to build the exporter for a single element.
        # @return [Class]
        def element_exporter
          ArticleJSON::Export::AppleNews::Elements::Base
        end
      end
    end
  end
end
|
@@ -19,22 +19,34 @@ module ArticleJSON
|
|
19
19
|
@float_node = @node
|
20
20
|
end
|
21
21
|
|
22
|
+
# The value of the image's `alt` attribute. Empty when the node is a plain
# image URL or when the image tag has no `alt` value.
# @return [String]
def alt
  return '' if image_url?

  alt_attribute = image_node.attribute('alt')
  alt_value = alt_attribute.nil? ? nil : alt_attribute.value
  alt_value || ''
end
|
29
|
+
|
22
30
|
# The source of the image: the stripped text of the node when the node is a
# plain image URL, otherwise the value of the image tag's `src` attribute.
# @return [String]
def source_url
  if image_url?
    @node.inner_text.strip
  else
    image_node.attribute('src').value
  end
end
|
27
37
|
|
28
38
|
# The node of the actual image: the first `img` found below the current
# node, or nil when there is none. The lookup result is memoized, including
# a nil result.
# @return [Nokogiri::HTML::Node]
def image_node
  @image_node = @node.xpath('.//img').first unless defined?(@image_node)
  @image_node
end
|
33
45
|
|
34
46
|
# Check if the image is floating (left, right or not at all). The inherited
# float detection is only consulted for plain image URLs and for images with
# a floatable size; otherwise the result is nil.
# @return [Symbol]
def float
  return unless image_url? || floatable_size?

  super
end
|
39
51
|
|
40
52
|
# Extracts an href from the tag [image-link-to: url]) if present
|
@@ -54,7 +66,8 @@ module ArticleJSON
|
|
54
66
|
source_url: source_url,
|
55
67
|
float: float,
|
56
68
|
caption: caption,
|
57
|
-
href: @href
|
69
|
+
href: @href,
|
70
|
+
alt: alt
|
58
71
|
)
|
59
72
|
end
|
60
73
|
|
@@ -73,6 +86,7 @@ module ArticleJSON
|
|
73
86
|
# Pattern for the `[image-link-to: url]` tag; the named group `url` captures
# the text between the whitespace following the colon and the closing
# bracket.
# @return [Regexp]
def href_regexp
  /\[image-link-to:\s+(?<url>.*?)\]/
end
|
89
|
+
|
76
90
|
# Check if the image's width can be determined and is less than 500px
|
77
91
|
# This is about 3/4 of the google document width...
|
78
92
|
# @return [Boolean]
|
@@ -94,6 +108,13 @@ module ArticleJSON
|
|
94
108
|
match['px'].to_i if match && match['px']
|
95
109
|
end
|
96
110
|
end
|
111
|
+
|
112
|
+
# Whether we are dealing with an image URL, i.e. the current node does not
# contain an actual image tag (`image_node` is nil).
# @return [Boolean]
def image_url?
  image_node.nil?
end
|
97
118
|
end
|
98
119
|
end
|
99
120
|
end
|
@@ -88,7 +88,17 @@ module ArticleJSON
|
|
88
88
|
# @return [Boolean]
|
89
89
|
# Check if the node is an image: either its text is an image URL or it
# contains at least one `img` tag. The result is memoized.
# @return [Boolean]
def image?
  unless defined?(@is_image)
    @is_image = image_url? || !node.xpath('.//img').empty?
  end
  @is_image
end
|
93
|
+
|
94
|
+
# Check if the node contains an image URL, i.e. its stripped text matches an
# http(s) URL with a jpg, jpeg, png or gif extension (case-insensitive). The
# pattern is not anchored, so the URL may be surrounded by other text. The
# result is memoized.
# @return [Boolean]
def image_url?
  unless defined?(@is_image_url)
    image_url_pattern = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
    @is_image_url = image_url_pattern.match?(node.inner_text.strip)
  end
  @is_image_url
end
|
93
103
|
|
94
104
|
# Check if the node contains an embedded element
|
@@ -6,7 +6,12 @@ module ArticleJSON
|
|
6
6
|
# @param [String] html
|
7
7
|
def initialize(html)
|
8
8
|
doc = Nokogiri::HTML(html)
|
9
|
-
|
9
|
+
selection = if doc.xpath('//body/div').empty?
|
10
|
+
doc.xpath('//body')
|
11
|
+
else
|
12
|
+
doc.xpath('//body/div')
|
13
|
+
end
|
14
|
+
@body_enumerator = selection.last.children.to_enum
|
10
15
|
|
11
16
|
css_node = doc.xpath('//head/style').last
|
12
17
|
@css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
|