article_json 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +24 -0
  3. data/README.md +108 -72
  4. data/bin/article_json_export_amp.rb +1 -0
  5. data/bin/article_json_export_apple_news.rb +15 -0
  6. data/bin/article_json_export_facebook.rb +1 -0
  7. data/bin/article_json_export_html.rb +1 -0
  8. data/bin/article_json_export_plain_text.rb +1 -0
  9. data/bin/article_json_parse_google_doc.rb +1 -0
  10. data/bin/check_google_doc_export.rb +41 -0
  11. data/bin/update_oembed_request-stubs.sh +1 -3
  12. data/bin/update_reference_document.sh +4 -0
  13. data/lib/article_json/article.rb +22 -2
  14. data/lib/article_json/configuration.rb +2 -1
  15. data/lib/article_json/elements/base.rb +0 -1
  16. data/lib/article_json/export/amp/elements/embed.rb +1 -1
  17. data/lib/article_json/export/apple_news/elements/base.rb +53 -0
  18. data/lib/article_json/export/apple_news/elements/embed.rb +130 -0
  19. data/lib/article_json/export/apple_news/elements/heading.rb +32 -0
  20. data/lib/article_json/export/apple_news/elements/image.rb +58 -0
  21. data/lib/article_json/export/apple_news/elements/list.rb +67 -0
  22. data/lib/article_json/export/apple_news/elements/paragraph.rb +36 -0
  23. data/lib/article_json/export/apple_news/elements/quote.rb +60 -0
  24. data/lib/article_json/export/apple_news/elements/text.rb +42 -0
  25. data/lib/article_json/export/apple_news/elements/text_box.rb +51 -0
  26. data/lib/article_json/export/apple_news/exporter.rb +37 -0
  27. data/lib/article_json/import/google_doc/html/image_parser.rb +16 -2
  28. data/lib/article_json/import/google_doc/html/node_analyzer.rb +11 -1
  29. data/lib/article_json/import/google_doc/html/parser.rb +6 -1
  30. data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +17 -1
  31. data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +1 -1
  32. data/lib/article_json/version.rb +1 -1
  33. data/lib/article_json.rb +11 -0
  34. metadata +33 -15
@@ -0,0 +1,130 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Embed < Base
6
+ # Embed | Embed, Caption
7
+ # @return [Hash, Array<Hash>]
8
+ def export
9
+ caption_text.nil? ? embed : [embed, caption]
10
+ end
11
+
12
+ private
13
+
14
+ # Embed
15
+ # @return [Hash]
16
+ def embed
17
+ {
18
+ role: role,
19
+ URL: source_url,
20
+ caption: caption_text,
21
+ }.compact
22
+ end
23
+
24
+ # Caption
25
+ # @return [Hash]
26
+ def caption
27
+ {
28
+ role: 'caption',
29
+ text: caption_text,
30
+ format: 'html',
31
+ layout: 'captionLayout',
32
+ textStyle: 'captionStyle',
33
+ }
34
+ end
35
+
36
+ # Get the exporter class for text elements
37
+ # @return [ArticleJSON::Export::Common::HTML::Elements::Base]
38
+ def text_exporter
39
+ self.class.exporter_by_type(:text)
40
+ end
41
+
42
+ # Caption Text
43
+ # @return [String]
44
+ def caption_text
45
+ return nil if role.nil? # Do not show captions for unsupported components
46
+
47
+ text.empty? ? nil : text
48
+ end
49
+
50
+ # @return [String]
51
+ def text
52
+ @element.caption.map do |child_element|
53
+ text_exporter.new(child_element)
54
+ .export
55
+ end.join
56
+ end
57
+
58
+ def role
59
+ @role ||=
60
+ case embed_type
61
+ when :youtube_video, :vimeo_video, :dailymotion_video
62
+ :embedwebvideo
63
+ when :facebook_video
64
+ :facebook_post
65
+ when :tweet
66
+ :tweet
67
+ when :slideshare
68
+ nil
69
+ when :soundcloud
70
+ nil
71
+ else
72
+ nil
73
+ end
74
+ end
75
+
76
+ def source_url
77
+ case embed_type
78
+ when :youtube_video
79
+ build_embeded_youtube_url
80
+ when :vimeo_video
81
+ build_embeded_vimeo_url
82
+ when :dailymotion_video
83
+ build_embeded_vimeo_url
84
+ when :facebook_video
85
+ build_facebook_video_url
86
+ when :tweet
87
+ build_twitter_url
88
+ when :slideshare
89
+ nil
90
+ when :soundcloud
91
+ nil
92
+ else
93
+ nil
94
+ end
95
+ end
96
+
97
+ def build_embeded_youtube_url
98
+ "https://www.youtube.com/embed/#{embed_id}"
99
+ end
100
+
101
+ def build_embeded_vimeo_url
102
+ "https://player.vimeo.com/video/#{embed_id}"
103
+ end
104
+
105
+ def build_embeded_dailymotion_url
106
+ "https://geo.dailymotion.com/player.html?video=#{embed_id}"
107
+ end
108
+
109
+ def build_facebook_video_url
110
+ username, id = embed_id.to_s.split("/", 2)
111
+ "https://www.facebook.com/#{username}/videos/#{id}"
112
+ end
113
+
114
+ def build_twitter_url
115
+ username, id = embed_id.to_s.split("/", 2)
116
+ "https://twitter.com/#{username}/status/#{id}"
117
+ end
118
+
119
+ def embed_type
120
+ @embed_type ||= @element.embed_type.to_sym
121
+ end
122
+
123
+ def embed_id
124
+ @embed_id ||= @element.embed_id.to_sym
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,32 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Heading < Base
6
+ # Headline
7
+ # @return [Hash]
8
+ def export
9
+ {
10
+ role: role,
11
+ text: @element.content,
12
+ layout: 'titleLayout',
13
+ textStyle: 'defaultTitle',
14
+ }
15
+ end
16
+
17
+ private
18
+
19
+ # The role of text component for adding a heading. (Required) Always
20
+ # one of these roles for this component: heading, heading1, heading2,
21
+ # heading3, heading4, heading5, or heading6.
22
+ # @return [String]
23
+ def role
24
+ return 'heading' if @element.level.nil?
25
+
26
+ "heading#{@element.level}"
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,58 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Image < Base
6
+ # Image | Image, Caption
7
+ # @return [Hash, Array<Hash>]
8
+ def export
9
+ caption_text.nil? ? image : [image, caption]
10
+ end
11
+
12
+ private
13
+ # Image
14
+ # @return [Hash]
15
+ def image
16
+ {
17
+ role: 'image',
18
+ URL: @element.source_url,
19
+ caption: caption_text,
20
+ }.compact
21
+ end
22
+
23
+ # Caption
24
+ # @return [Hash]
25
+ def caption
26
+ {
27
+ role: 'caption',
28
+ text: caption_text,
29
+ format: 'html',
30
+ layout: 'captionLayout',
31
+ textStyle: 'captionStyle',
32
+ }
33
+ end
34
+
35
+ # Get the exporter class for text elements
36
+ # @return [ArticleJSON::Export::Common::HTML::Elements::Base]
37
+ def text_exporter
38
+ self.class.exporter_by_type(:text)
39
+ end
40
+
41
+ # Caption Text
42
+ # @return [String]
43
+ def caption_text
44
+ text.empty? ? nil : text
45
+ end
46
+
47
+ # @return [String]
48
+ def text
49
+ @element.caption.map do |child_element|
50
+ text_exporter.new(child_element)
51
+ .export
52
+ end.join
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,67 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class List < Base
6
+ # List
7
+ # @return [Hash]
8
+ def export
9
+ {
10
+ role: 'body',
11
+ text: list_text,
12
+ format: 'html',
13
+ layout: 'bodyLayout',
14
+ textStyle: 'bodyStyle',
15
+ }
16
+ end
17
+
18
+ private
19
+
20
+ # Get the exporter class for text elements
21
+ #
22
+ # @return [ArticleJSON::Export::Common::HTML::Elements::<Class>]
23
+ def text_exporter
24
+ self.class.exporter_by_type(:text)
25
+ end
26
+
27
+ # When it is an unordered list wrap it in <ul></ul>
28
+ # When it is an ordered list wrap it in <ol></ol>
29
+ #
30
+ # List Text
31
+ # @return [String]
32
+ def list_text
33
+ prepend_list_tag + list + append_list_tag
34
+ end
35
+
36
+ # Each list item should be wrapped in <li></li>
37
+ #
38
+ # @return [String]
39
+ def list
40
+ @element.content.map do |paragraph_element|
41
+ line_item = paragraph_element.content.map do |text_element|
42
+ text_exporter.new(text_element).export
43
+ end.join
44
+
45
+ "<li>#{line_item}</li>"
46
+ end.join
47
+ end
48
+
49
+ # @return [String]
50
+ def prepend_list_tag
51
+ ordered_list? ? '<ol>' : '<ul>'
52
+ end
53
+
54
+ # @return [String]
55
+ def append_list_tag
56
+ ordered_list? ? '</ol>' : '</ul>'
57
+ end
58
+
59
+ # @return [Boolean]
60
+ def ordered_list?
61
+ @element.list_type == :ordered
62
+ end
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,36 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Paragraph < Base
6
+ # Generate the paragraph node with its containing text elements
7
+ # @return [Hash]
8
+ def export
9
+ {
10
+ role: 'body',
11
+ text: text,
12
+ format: 'html',
13
+ layout: 'bodyLayout',
14
+ textStyle: 'bodyStyle',
15
+ }
16
+ end
17
+
18
+ private
19
+
20
+ # Get the exporter class for text elements
21
+ # @return [ArticleJSON::Export::Common::HTML::Elements::Base]
22
+ def text_exporter
23
+ self.class.exporter_by_type(:text)
24
+ end
25
+
26
+ def text
27
+ @element.content.map do |child_element|
28
+ text_exporter.new(child_element)
29
+ .export
30
+ end.join
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,60 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Quote < Base
6
+ include ArticleJSON::Export::Common::HTML::Elements::Base
7
+ include ArticleJSON::Export::Common::HTML::Elements::Text
8
+
9
+ def export
10
+ [quote, author]
11
+ end
12
+
13
+ private
14
+
15
+ # Quote
16
+ # @return [Hash]
17
+ def quote
18
+ {
19
+ role: 'pullquote',
20
+ text: quote_text,
21
+ format: 'html',
22
+ layout: 'pullquoteLayout',
23
+ textStyle: 'pullquoteStyle',
24
+ }
25
+ end
26
+
27
+ # Author
28
+ # @return [Hash]
29
+ def author
30
+ {
31
+ role: 'author',
32
+ text: author_text,
33
+ format: 'html',
34
+ layout: 'pullquoteAttributeLayout',
35
+ textStyle: 'quoteAttributeStyle',
36
+ }
37
+ end
38
+
39
+ def text_exporter
40
+ self.class.exporter_by_type(:text)
41
+ end
42
+
43
+ # Quote Text
44
+ # @return [String]
45
+ def quote_text
46
+ element = @element.content.first&.content.first
47
+ text_exporter.new(element).export
48
+ end
49
+
50
+ # Author Text
51
+ # @return [String]
52
+ def author_text
53
+ element = @element.caption.first
54
+ text_exporter.new(element).export
55
+ end
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -0,0 +1,42 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class Text < Base
6
+ include ArticleJSON::Export::Common::HTML::Elements::Base
7
+ include ArticleJSON::Export::Common::HTML::Elements::Text
8
+
9
+ UNSUPPORTED_HTML_TAGS = %w[title meta script noscript style link applet object iframe
10
+ noframes form select option optgroup
11
+ ].freeze
12
+
13
+ # A Nokogiri object is returned with `super`, which is then
14
+ # returned as either a string or as HTML (when not plain text),
15
+ # both of which are compatible with Apple News format. Takes into
16
+ # account bold, italic and href.
17
+ # @return [String]
18
+ def export
19
+ super.to_s
20
+ end
21
+
22
+ # @param [String] text
23
+ def create_text_nodes(text)
24
+ Nokogiri::HTML.fragment(sanitize_text(text).gsub(/\n/, '<br>')).children
25
+ end
26
+
27
+ # Removes UNSUPPORTED_HTML_TAGS from text
28
+ #
29
+ # @param [String] text
30
+ # @return [String]
31
+ def sanitize_text(text)
32
+ doc = Nokogiri::HTML.fragment(text)
33
+ UNSUPPORTED_HTML_TAGS.each do |tag|
34
+ doc.search(tag).each(&:remove)
35
+ end
36
+ doc.inner_html
37
+ end
38
+ end
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,51 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ module Elements
5
+ class TextBox < Base
6
+ include ArticleJSON::Export::Common::HTML::Elements::TextBox
7
+ # Text box container
8
+ # @return [Hash]
9
+ def export
10
+ {
11
+ role: 'container',
12
+ layout: 'textBoxLayout',
13
+ style: 'textBoxStyle',
14
+ components: map_styles(elements),
15
+ }
16
+ end
17
+
18
+ private
19
+
20
+ # @return [Array]
21
+ def elements
22
+ @element.content.map do |child_element|
23
+ case child_element
24
+ when ArticleJSON::Elements::Heading
25
+ namespace::Heading.new(child_element).export
26
+ when ArticleJSON::Elements::Paragraph
27
+ namespace::Paragraph.new(child_element).export
28
+ when ArticleJSON::Elements::List
29
+ namespace::List.new(child_element).export
30
+ else
31
+ namespace::Text.new(child_element).export
32
+ end
33
+ end
34
+ end
35
+
36
+ # @return [Module]
37
+ def namespace
38
+ ArticleJSON::Export::AppleNews::Elements
39
+ end
40
+
41
+ # @return [Array]
42
+ def map_styles(elements)
43
+ elements.map do |child_element|
44
+ child_element.merge(layout: 'textBox' + child_element[:layout].sub(/\S/, &:upcase))
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,37 @@
1
+ module ArticleJSON
2
+ module Export
3
+ module AppleNews
4
+ class Exporter
5
+ # @param [Array[ArticleJSON::Elements::Base]] elements
6
+ def initialize(elements)
7
+ @elements = elements
8
+ end
9
+
10
+ # Return the components section of an Apple News Article as JSON
11
+ #
12
+ # Images and embedded videos are nested in an array within the components
13
+ # array when they contain captions. As Apple News skips over these
14
+ # nested arrays, we must flatten the array.
15
+ #
16
+ # @return [String]
17
+ def to_json
18
+ { components: components.flatten }.to_json
19
+ end
20
+
21
+ private
22
+
23
+ # Generate an array with the Apple News component representation of all elements
24
+ #
25
+ # @return [Array]
26
+ def components
27
+ @components ||=
28
+ @elements.map do |element|
29
+ ArticleJSON::Export::AppleNews::Elements::Base
30
+ .build(element)
31
+ &.export
32
+ end.reject { |hash| hash.nil? || hash.empty? }
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -22,25 +22,31 @@ module ArticleJSON
22
22
  # The value of the image's `alt` attribute
23
23
  # @return [String]
24
24
  def alt
25
+ return '' if image_url?
26
+
25
27
  image_node.attribute('alt')&.value || ''
26
28
  end
27
29
 
28
30
  # The value of the image's `src` attribute
29
31
  # @return [String]
30
32
  def source_url
33
+ return @node.inner_text.strip if image_url?
34
+
31
35
  image_node.attribute('src').value
32
36
  end
33
37
 
34
38
  # The node of the actual image
35
39
  # @return [Nokogiri::HTML::Node]
36
40
  def image_node
37
- @node.xpath('.//img').first
41
+ return @image_node if defined? @image_node
42
+
43
+ @image_node = @node.xpath('.//img').first
38
44
  end
39
45
 
40
46
  # Check if the image is floating (left, right or not at all)
41
47
  # @return [Symbol]
42
48
  def float
43
- super if floatable_size?
49
+ super if image_url? || floatable_size?
44
50
  end
45
51
 
46
52
  # Extracts an href from the tag [image-link-to: url]) if present
@@ -80,6 +86,7 @@ module ArticleJSON
80
86
  def href_regexp
81
87
  %r{\[image-link-to:\s+(?<url>.*?)\]}
82
88
  end
89
+
83
90
  # Check if the image's width can be determined and is less than 500px
84
91
  # This is about 3/4 of the google document width...
85
92
  # @return [Boolean]
@@ -101,6 +108,13 @@ module ArticleJSON
101
108
  match['px'].to_i if match && match['px']
102
109
  end
103
110
  end
111
+
112
+ # When the current node doesn't contain an actual image tag,
113
+ # we're dealing with an image URL
114
+ # @return [Boolean]
115
+ def image_url?
116
+ image_node.nil?
117
+ end
104
118
  end
105
119
  end
106
120
  end
@@ -88,7 +88,17 @@ module ArticleJSON
88
88
  # @return [Boolean]
89
89
  def image?
90
90
  return @is_image if defined? @is_image
91
- @is_image = node.xpath('.//img').length > 0
91
+ @is_image = image_url? || node.xpath('.//img').length > 0
92
+ end
93
+
94
+ # Check if the node contains an image URL
95
+ # @return [Boolean]
96
+ def image_url?
97
+ return @is_image_url if defined? @is_image_url
98
+
99
+ text = node.inner_text.strip
100
+ url_regexp = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
101
+ @is_image_url = !!(url_regexp =~ text)
92
102
  end
93
103
 
94
104
  # Check if the node contains an embedded element
@@ -6,7 +6,12 @@ module ArticleJSON
6
6
  # @param [String] html
7
7
  def initialize(html)
8
8
  doc = Nokogiri::HTML(html)
9
- @body_enumerator = doc.xpath('//body').last.children.to_enum
9
+ selection = if doc.xpath('//body/div').empty?
10
+ doc.xpath('//body')
11
+ else
12
+ doc.xpath('//body/div')
13
+ end
14
+ @body_enumerator = selection.last.children.to_enum
10
15
 
11
16
  css_node = doc.xpath('//head/style').last
12
17
  @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)
@@ -11,7 +11,8 @@ module ArticleJSON
11
11
  # The URL for the oembed API call
12
12
  # @return [String]
13
13
  def oembed_url
14
- "https://www.facebook.com/plugins/video/oembed.json?url=#{source_url}"
14
+ "https://graph.facebook.com/v9.0/oembed_video?url=#{source_url}" \
15
+ "&access_token=#{access_token}"
15
16
  end
16
17
 
17
18
  # The video URL of the element
@@ -19,6 +20,21 @@ module ArticleJSON
19
20
  def source_url
20
21
  "https://www.facebook.com/facebook/videos/#{@element.embed_id}"
21
22
  end
23
+
24
+ # The facebook access token. If not set, it raises an exception
25
+ # explaining how to configure it.
26
+ #
27
+ # @return [String]
28
+ def access_token
29
+ token = ArticleJSON.configuration.facebook_token
30
+
31
+ if token.nil?
32
+ raise 'You need to configure the facebook token to use facebook' \
33
+ 'embed videos, see:' \
34
+ 'https://github.com/Devex/article_json#facebook-oembed'
35
+ end
36
+ token
37
+ end
22
38
  end
23
39
  end
24
40
  end
@@ -11,7 +11,7 @@ module ArticleJSON
11
11
  # The URL for the oembed API call
12
12
  # @return [String]
13
13
  def oembed_url
14
- "http://www.youtube.com/oembed?format=json&url=#{source_url}"
14
+ "https://www.youtube.com/oembed?format=json&url=#{source_url}"
15
15
  end
16
16
 
17
17
  # The video URL of the element
@@ -1,3 +1,3 @@
1
1
  module ArticleJSON
2
- VERSION = '0.3.8'
2
+ VERSION = '0.4.0'
3
3
  end
data/lib/article_json.rb CHANGED
@@ -77,6 +77,17 @@ require_relative 'article_json/export/html/elements/quote'
77
77
  require_relative 'article_json/export/html/elements/embed'
78
78
  require_relative 'article_json/export/html/exporter'
79
79
 
80
+ require_relative 'article_json/export/apple_news/elements/base'
81
+ require_relative 'article_json/export/apple_news/elements/text'
82
+ require_relative 'article_json/export/apple_news/elements/heading'
83
+ require_relative 'article_json/export/apple_news/elements/paragraph'
84
+ require_relative 'article_json/export/apple_news/elements/list'
85
+ require_relative 'article_json/export/apple_news/elements/image'
86
+ require_relative 'article_json/export/apple_news/elements/embed'
87
+ require_relative 'article_json/export/apple_news/elements/quote'
88
+ require_relative 'article_json/export/apple_news/elements/text_box'
89
+ require_relative 'article_json/export/apple_news/exporter'
90
+
80
91
  require_relative 'article_json/export/amp/elements/base'
81
92
  require_relative 'article_json/export/amp/elements/text'
82
93
  require_relative 'article_json/export/amp/elements/paragraph'