article_json 0.3.7 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -0
  3. data/README.md +118 -67
  4. data/bin/article_json_export_amp.rb +15 -0
  5. data/bin/article_json_export_apple_news.rb +15 -0
  6. data/bin/article_json_export_facebook.rb +16 -0
  7. data/bin/article_json_export_html.rb +1 -0
  8. data/bin/article_json_export_plain_text.rb +15 -0
  9. data/bin/article_json_parse_google_doc.rb +1 -0
  10. data/bin/check_google_doc_export.rb +41 -0
  11. data/bin/update_oembed_request-stubs.sh +11 -0
  12. data/bin/update_reference_document.sh +16 -0
  13. data/lib/article_json/article.rb +22 -2
  14. data/lib/article_json/configuration.rb +2 -1
  15. data/lib/article_json/elements/base.rb +0 -1
  16. data/lib/article_json/elements/image.rb +7 -3
  17. data/lib/article_json/export/amp/elements/embed.rb +1 -1
  18. data/lib/article_json/export/apple_news/elements/base.rb +53 -0
  19. data/lib/article_json/export/apple_news/elements/embed.rb +130 -0
  20. data/lib/article_json/export/apple_news/elements/heading.rb +32 -0
  21. data/lib/article_json/export/apple_news/elements/image.rb +58 -0
  22. data/lib/article_json/export/apple_news/elements/list.rb +67 -0
  23. data/lib/article_json/export/apple_news/elements/paragraph.rb +36 -0
  24. data/lib/article_json/export/apple_news/elements/quote.rb +60 -0
  25. data/lib/article_json/export/apple_news/elements/text.rb +42 -0
  26. data/lib/article_json/export/apple_news/elements/text_box.rb +51 -0
  27. data/lib/article_json/export/apple_news/exporter.rb +37 -0
  28. data/lib/article_json/export/common/html/elements/image.rb +1 -1
  29. data/lib/article_json/import/google_doc/html/image_parser.rb +24 -3
  30. data/lib/article_json/import/google_doc/html/node_analyzer.rb +11 -1
  31. data/lib/article_json/import/google_doc/html/parser.rb +6 -1
  32. data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +17 -1
  33. data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +1 -1
  34. data/lib/article_json/version.rb +1 -1
  35. data/lib/article_json.rb +11 -0
  36. metadata +37 -15
@@ -0,0 +1,53 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Shared base of all Apple News element exporters. It mixes in the
        # common exporter behaviour and tells it which namespace, format and
        # default element exporters belong to the Apple News export.
        class Base
          include ArticleJSON::Export::Common::Elements::Base

          # Export the given element by delegating to the `#export` provided
          # by the common base module. Any falsy outcome (e.g. when no
          # exporter is available for the element type) is normalised to nil.
          # @return [Object, nil] the inherited export result, or nil
          def export
            exported = super
            exported ? exported : nil
          end

          # The module that this class and its sibling exporters live in.
          # @return [Module]
          def self.namespace
            ArticleJSON::Export::AppleNews::Elements
          end

          # Identifier of the output format; used to pick the matching custom
          # element exporters from the configuration.
          # @return [Symbol]
          def self.export_format
            :apple_news
          end
          private_class_method :export_format

          # Exporter classes used per element type when nothing else is
          # configured.
          # @return [Hash{Symbol => Class}]
          def self.default_exporter_mapping
            elements = namespace
            {
              text: elements::Text,
              paragraph: elements::Paragraph,
              heading: elements::Heading,
              quote: elements::Quote,
              list: elements::List,
              image: elements::Image,
              embed: elements::Embed,
              text_box: elements::TextBox,
            }
          end
          private_class_method :default_exporter_mapping
        end
      end
    end
  end
end
@@ -0,0 +1,130 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports an embed element (web video, Facebook video, tweet) as an
        # Apple News component, followed by a separate caption component when
        # the embed carries caption text.
        class Embed < Base
          # Apple News component role per supported embed type. Embed types
          # without an entry (e.g. slideshare, soundcloud) have no role and
          # are treated as unsupported.
          ROLES = {
            youtube_video: :embedwebvideo,
            vimeo_video: :embedwebvideo,
            dailymotion_video: :embedwebvideo,
            facebook_video: :facebook_post,
            tweet: :tweet,
          }.freeze

          # Embed | Embed, Caption
          #
          # Unsupported embed types produce an empty hash, because role, URL
          # and caption are all nil and get compacted away.
          # @return [Hash, Array<Hash>]
          def export
            caption_text.nil? ? embed : [embed, caption]
          end

          private

          # Embed
          # @return [Hash]
          def embed
            {
              role: role,
              URL: source_url,
              caption: caption_text,
            }.compact
          end

          # Caption
          # @return [Hash]
          def caption
            {
              role: 'caption',
              text: caption_text,
              format: 'html',
              layout: 'captionLayout',
              textStyle: 'captionStyle',
            }
          end

          # Get the exporter class for text elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Caption Text
          # @return [String, nil] nil for unsupported embeds or empty captions
          def caption_text
            return nil if role.nil? # Do not show captions for unsupported components

            rendered = text
            rendered.empty? ? nil : rendered
          end

          # All caption text elements, exported and concatenated
          # @return [String]
          def text
            @element.caption.map do |child_element|
              text_exporter.new(child_element).export
            end.join
          end

          # @return [Symbol, nil] nil when the embed type is not supported
          def role
            ROLES[embed_type]
          end

          # URL Apple News should load for the embed
          # @return [String, nil] nil when the embed type is not supported
          def source_url
            case embed_type
            when :youtube_video then build_embeded_youtube_url
            when :vimeo_video then build_embeded_vimeo_url
            # Dailymotion needs its own player URL, not the Vimeo one
            when :dailymotion_video then build_embeded_dailymotion_url
            when :facebook_video then build_facebook_video_url
            when :tweet then build_twitter_url
            end
          end

          # @return [String]
          def build_embeded_youtube_url
            "https://www.youtube.com/embed/#{embed_id}"
          end

          # @return [String]
          def build_embeded_vimeo_url
            "https://player.vimeo.com/video/#{embed_id}"
          end

          # @return [String]
          def build_embeded_dailymotion_url
            "https://geo.dailymotion.com/player.html?video=#{embed_id}"
          end

          # The embed id is split as "<username>/<video id>"
          # @return [String]
          def build_facebook_video_url
            username, id = embed_id.split('/', 2)
            "https://www.facebook.com/#{username}/videos/#{id}"
          end

          # The embed id is split as "<username>/<status id>"
          # @return [String]
          def build_twitter_url
            username, id = embed_id.split('/', 2)
            "https://twitter.com/#{username}/status/#{id}"
          end

          # @return [Symbol]
          def embed_type
            @embed_type ||= @element.embed_type.to_sym
          end

          # Kept as a String: the id comes from external content and is only
          # ever interpolated or split, so it should not be turned into a
          # Symbol.
          # @return [String]
          def embed_id
            @embed_id ||= @element.embed_id.to_s
          end
        end
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a heading element as an Apple News heading component.
        class Heading < Base
          # Headline component built from the element's content and level
          # @return [Hash]
          def export
            component = { role: role }
            component[:text] = @element.content
            component[:layout] = 'titleLayout'
            component[:textStyle] = 'defaultTitle'
            component
          end

          private

          # Role of the heading component: plain `heading` when the element
          # has no level, otherwise `heading` suffixed with the level
          # (heading1, heading2, ...).
          # @return [String]
          def role
            level = @element.level
            level.nil? ? 'heading' : "heading#{level}"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports an image element as an Apple News image component, plus a
        # separate caption component when the image has caption text.
        class Image < Base
          # Image | Image, Caption
          # @return [Hash, Array<Hash>]
          def export
            return image if caption_text.nil?

            [image, caption]
          end

          private

          # Image component; the caption key is dropped when there is no
          # caption text.
          # @return [Hash]
          def image
            component = {
              role: 'image',
              URL: @element.source_url,
              caption: caption_text,
            }
            component.compact
          end

          # Caption component rendered below the image
          # @return [Hash]
          def caption
            {
              role: 'caption',
              text: caption_text,
              format: 'html',
              layout: 'captionLayout',
              textStyle: 'captionStyle',
            }
          end

          # Exporter class responsible for text elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Caption text, or nil when the exported caption is empty
          # @return [String, nil]
          def caption_text
            rendered = text
            rendered.empty? ? nil : rendered
          end

          # Every caption text element, exported and concatenated
          # @return [String]
          def text
            exporter_class = text_exporter
            @element.caption.map { |child| exporter_class.new(child).export }.join
          end
        end
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a list element as an Apple News body component whose text
        # is an HTML list.
        class List < Base
          # Body component containing the list markup
          # @return [Hash]
          def export
            {
              role: 'body',
              text: list_text,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle',
            }
          end

          private

          # Exporter class responsible for text elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # The list items wrapped in <ol></ol> for ordered lists and in
          # <ul></ul> otherwise.
          # @return [String]
          def list_text
            "#{prepend_list_tag}#{list}#{append_list_tag}"
          end

          # One <li></li> per list entry, holding the exported text elements
          # of that entry.
          # @return [String]
          def list
            exporter_class = text_exporter
            items = @element.content.map do |paragraph_element|
              item_html =
                paragraph_element
                .content
                .map { |text_element| exporter_class.new(text_element).export }
                .join
              "<li>#{item_html}</li>"
            end
            items.join
          end

          # Opening list tag
          # @return [String]
          def prepend_list_tag
            return '<ol>' if ordered_list?

            '<ul>'
          end

          # Closing list tag
          # @return [String]
          def append_list_tag
            return '</ol>' if ordered_list?

            '</ul>'
          end

          # @return [Boolean]
          def ordered_list?
            :ordered == @element.list_type
          end
        end
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a paragraph element as an Apple News body component.
        class Paragraph < Base
          # Body component holding the paragraph's exported text elements
          # @return [Hash]
          def export
            body = { role: 'body', text: text }
            body.merge(
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle'
            )
          end

          private

          # Exporter class responsible for text elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Every text element of the paragraph, exported and concatenated
          # @return [String]
          def text
            exporter_class = text_exporter
            @element.content.map { |child| exporter_class.new(child).export }.join
          end
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a quote element as an Apple News pullquote component,
        # followed by an author component when the quote has a caption.
        class Quote < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # Pullquote | Pullquote, Author
          #
          # Components without any text are left out, so a quote without a
          # caption no longer produces an author component and an entirely
          # empty quote yields an empty array.
          # @return [Array<Hash>]
          def export
            [quote, author].compact
          end

          private

          # Quote
          # @return [Hash, nil] nil when the quote has no text
          def quote
            html = quote_text
            return nil if html.empty?

            {
              role: 'pullquote',
              text: html,
              format: 'html',
              layout: 'pullquoteLayout',
              textStyle: 'pullquoteStyle',
            }
          end

          # Author
          # @return [Hash, nil] nil when the quote has no caption text
          def author
            html = author_text
            return nil if html.empty?

            {
              role: 'author',
              text: html,
              format: 'html',
              layout: 'pullquoteAttributeLayout',
              textStyle: 'quoteAttributeStyle',
            }
          end

          # Get the exporter class for text elements
          # @return [Class]
          def text_exporter
            self.class.exporter_by_type(:text)
          end

          # Quote Text
          #
          # Exports every text element of every paragraph of the quote (not
          # only the very first one, which silently dropped formatted
          # segments). Paragraphs are separated by a line break. A quote with
          # a single text element renders exactly as before.
          # @return [String] empty when the quote has no content
          def quote_text
            Array(@element.content)
              .map { |paragraph| joined_text_export(paragraph.content) }
              .reject(&:empty?)
              .join('<br>')
          end

          # Author Text
          # @return [String] empty when the quote has no caption
          def author_text
            joined_text_export(@element.caption)
          end

          # Export the given text elements and concatenate the results.
          # Tolerates nil so that missing content does not raise.
          # @param [Array, nil] text_elements
          # @return [String]
          def joined_text_export(text_elements)
            Array(text_elements)
              .map { |text_element| text_exporter.new(text_element).export }
              .join
          end
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text element as an HTML string for Apple News.
        class Text < Base
          include ArticleJSON::Export::Common::HTML::Elements::Base
          include ArticleJSON::Export::Common::HTML::Elements::Text

          # HTML tags that are stripped from the text before it is exported
          UNSUPPORTED_HTML_TAGS = %w[
            title meta script noscript style link applet object iframe
            noframes form select option optgroup
          ].freeze

          # Converts the result of the inherited `#export` to a string. Per
          # the original note, the inherited export yields a Nokogiri object
          # that already reflects bold, italic and href, and its string form
          # (plain text or HTML) is compatible with the Apple News format.
          # @return [String]
          def export
            rendered = super
            rendered.to_s
          end

          # Build the nodes for the given text: unsupported tags are removed
          # first, then every newline is replaced with a `<br>`.
          # @param [String] text
          # @return [Nokogiri::XML::NodeSet]
          def create_text_nodes(text)
            html = sanitize_text(text).gsub("\n", '<br>')
            Nokogiri::HTML.fragment(html).children
          end

          # Removes every tag listed in UNSUPPORTED_HTML_TAGS, including its
          # content, from the text.
          # @param [String] text
          # @return [String]
          def sanitize_text(text)
            fragment = Nokogiri::HTML.fragment(text)
            UNSUPPORTED_HTML_TAGS.each do |tag_name|
              fragment.search(tag_name).each { |node| node.unlink }
            end
            fragment.inner_html
          end
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module ArticleJSON
  module Export
    module AppleNews
      module Elements
        # Exports a text box element as an Apple News container component
        # that holds the exported child elements.
        class TextBox < Base
          include ArticleJSON::Export::Common::HTML::Elements::TextBox

          # Text box container
          # @return [Hash]
          def export
            {
              role: 'container',
              layout: 'textBoxLayout',
              style: 'textBoxStyle',
              components: map_styles(elements),
            }
          end

          private

          # Export every child element as a component hash. Headings,
          # paragraphs and lists use their dedicated exporters. Anything else
          # is exported as text and wrapped in a body component, because the
          # text exporter returns a bare String that `map_styles` cannot
          # restyle (it calls `[:layout]` and `merge` on each component).
          # @return [Array<Hash>]
          def elements
            @element.content.map do |child_element|
              case child_element
              when ArticleJSON::Elements::Heading
                namespace::Heading.new(child_element).export
              when ArticleJSON::Elements::Paragraph
                namespace::Paragraph.new(child_element).export
              when ArticleJSON::Elements::List
                namespace::List.new(child_element).export
              else
                body_component(namespace::Text.new(child_element).export)
              end
            end
          end

          # Wrap exported text in a body component, using the same keys and
          # values as the paragraph exporter's body component.
          # @param [String] html
          # @return [Hash]
          def body_component(html)
            {
              role: 'body',
              text: html,
              format: 'html',
              layout: 'bodyLayout',
              textStyle: 'bodyStyle',
            }
          end

          # @return [Module]
          def namespace
            ArticleJSON::Export::AppleNews::Elements
          end

          # Point every child component at the text box variant of its
          # layout, e.g. `bodyLayout` becomes `textBoxBodyLayout`.
          # @param [Array<Hash>] elements
          # @return [Array<Hash>]
          def map_styles(elements)
            elements.map do |child_element|
              layout = child_element[:layout].sub(/\S/, &:upcase)
              child_element.merge(layout: "textBox#{layout}")
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module ArticleJSON
  module Export
    module AppleNews
      # Exports a list of article elements as the `components` section of an
      # Apple News article.
      class Exporter
        # @param [Array[ArticleJSON::Elements::Base]] elements
        def initialize(elements)
          @elements = elements
        end

        # Return the components section of an Apple News Article as JSON
        #
        # Element exporters may return an array of components (e.g. an image
        # or embed together with its caption). As Apple News skips over
        # nested arrays, the list of components is flattened.
        #
        # Generator arguments are accepted and passed on, so the exporter can
        # also be serialized as part of a larger structure (the JSON
        # generator calls `to_json(state)` on nested objects).
        #
        # @param [Array] args optional JSON generator arguments
        # @return [String]
        def to_json(*args)
          { components: components.flatten }.to_json(*args)
        end

        private

        # Export every element to its Apple News component(s). Elements
        # without an exporter as well as nil or empty export results are
        # dropped.
        #
        # @return [Array<Hash, Array<Hash>>]
        def components
          @components ||=
            @elements
            .map do |element|
              ArticleJSON::Export::AppleNews::Elements::Base
                .build(element)
                &.export
            end
            .reject { |component| component.nil? || component.empty? }
        end
      end
    end
  end
end
@@ -29,7 +29,7 @@ module ArticleJSON
29
29
 
30
30
  # @return [Nokogiri::XML::NodeSet]
31
31
  def image_node
32
- create_element(:img, src: @element.source_url)
32
+ create_element(:img, src: @element.source_url, alt: @element.alt)
33
33
  end
34
34
 
35
35
  # @return [Nokogiri::XML::NodeSet]
@@ -19,22 +19,34 @@ module ArticleJSON
19
19
  @float_node = @node
20
20
  end
21
21
 
22
+ # The value of the image's `alt` attribute
23
+ # @return [String]
24
+ def alt
25
+ return '' if image_url?
26
+
27
+ image_node.attribute('alt')&.value || ''
28
+ end
29
+
22
30
  # The value of the image's `src` attribute
23
31
  # @return [String]
24
32
  def source_url
33
+ return @node.inner_text.strip if image_url?
34
+
25
35
  image_node.attribute('src').value
26
36
  end
27
37
 
28
38
  # The node of the actual image
29
39
  # @return [Nokogiri::HTML::Node]
30
40
  def image_node
31
- @node.xpath('.//img').first
41
+ return @image_node if defined? @image_node
42
+
43
+ @image_node = @node.xpath('.//img').first
32
44
  end
33
45
 
34
46
  # Check if the image is floating (left, right or not at all)
35
47
  # @return [Symbol]
36
48
  def float
37
- super if floatable_size?
49
+ super if image_url? || floatable_size?
38
50
  end
39
51
 
40
52
  # Extracts an href from the tag [image-link-to: url]) if present
@@ -54,7 +66,8 @@ module ArticleJSON
54
66
  source_url: source_url,
55
67
  float: float,
56
68
  caption: caption,
57
- href: @href
69
+ href: @href,
70
+ alt: alt
58
71
  )
59
72
  end
60
73
 
@@ -73,6 +86,7 @@ module ArticleJSON
73
86
  def href_regexp
74
87
  %r{\[image-link-to:\s+(?<url>.*?)\]}
75
88
  end
89
+
76
90
  # Check if the image's width can be determined and is less than 500px
77
91
  # This is about 3/4 of the google document width...
78
92
  # @return [Boolean]
@@ -94,6 +108,13 @@ module ArticleJSON
94
108
  match['px'].to_i if match && match['px']
95
109
  end
96
110
  end
111
+
112
+ # When the current node doesn't contain an actual image tag,
113
+ # we're dealing with an image URL
114
+ # @return [Boolean]
115
+ def image_url?
116
+ image_node.nil?
117
+ end
97
118
  end
98
119
  end
99
120
  end
@@ -88,7 +88,17 @@ module ArticleJSON
88
88
  # @return [Boolean]
89
89
  def image?
90
90
  return @is_image if defined? @is_image
91
- @is_image = node.xpath('.//img').length > 0
91
+ @is_image = image_url? || node.xpath('.//img').length > 0
92
+ end
93
+
94
+ # Check if the node contains an image URL
95
+ # @return [Boolean]
96
+ def image_url?
97
+ return @is_image_url if defined? @is_image_url
98
+
99
+ text = node.inner_text.strip
100
+ url_regexp = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
101
+ @is_image_url = !!(url_regexp =~ text)
92
102
  end
93
103
 
94
104
  # Check if the node contains an embedded element
@@ -6,7 +6,12 @@ module ArticleJSON
6
6
  # @param [String] html
7
7
  def initialize(html)
8
8
  doc = Nokogiri::HTML(html)
9
- @body_enumerator = doc.xpath('//body').last.children.to_enum
9
+ selection = if doc.xpath('//body/div').empty?
10
+ doc.xpath('//body')
11
+ else
12
+ doc.xpath('//body/div')
13
+ end
14
+ @body_enumerator = selection.last.children.to_enum
10
15
 
11
16
  css_node = doc.xpath('//head/style').last
12
17
  @css_analyzer = CSSAnalyzer.new(css_node&.inner_text)