article_json 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE +21 -0
  4. data/README.md +78 -0
  5. data/bin/article_json_export_google_doc.rb +22 -0
  6. data/bin/article_json_export_html.rb +14 -0
  7. data/bin/article_json_parse_google_doc.rb +14 -0
  8. data/bin/update_reference_document.sh +18 -0
  9. data/lib/article_json/article.rb +53 -0
  10. data/lib/article_json/configuration.rb +24 -0
  11. data/lib/article_json/elements/base.rb +40 -0
  12. data/lib/article_json/elements/embed.rb +58 -0
  13. data/lib/article_json/elements/heading.rb +37 -0
  14. data/lib/article_json/elements/image.rb +41 -0
  15. data/lib/article_json/elements/list.rb +37 -0
  16. data/lib/article_json/elements/paragraph.rb +31 -0
  17. data/lib/article_json/elements/quote.rb +41 -0
  18. data/lib/article_json/elements/text.rb +45 -0
  19. data/lib/article_json/elements/text_box.rb +37 -0
  20. data/lib/article_json/export/html/elements/base.rb +59 -0
  21. data/lib/article_json/export/html/elements/embed.rb +28 -0
  22. data/lib/article_json/export/html/elements/heading.rb +19 -0
  23. data/lib/article_json/export/html/elements/image.rb +33 -0
  24. data/lib/article_json/export/html/elements/list.rb +25 -0
  25. data/lib/article_json/export/html/elements/paragraph.rb +17 -0
  26. data/lib/article_json/export/html/elements/quote.rb +29 -0
  27. data/lib/article_json/export/html/elements/shared/caption.rb +22 -0
  28. data/lib/article_json/export/html/elements/shared/float.rb +17 -0
  29. data/lib/article_json/export/html/elements/text.rb +44 -0
  30. data/lib/article_json/export/html/elements/text_box.rb +25 -0
  31. data/lib/article_json/export/html/exporter.rb +22 -0
  32. data/lib/article_json/import/google_doc/html/css_analyzer.rb +144 -0
  33. data/lib/article_json/import/google_doc/html/embedded_facebook_video_parser.rb +33 -0
  34. data/lib/article_json/import/google_doc/html/embedded_parser.rb +113 -0
  35. data/lib/article_json/import/google_doc/html/embedded_slideshare_parser.rb +36 -0
  36. data/lib/article_json/import/google_doc/html/embedded_tweet_parser.rb +37 -0
  37. data/lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb +29 -0
  38. data/lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb +33 -0
  39. data/lib/article_json/import/google_doc/html/heading_parser.rb +38 -0
  40. data/lib/article_json/import/google_doc/html/image_parser.rb +75 -0
  41. data/lib/article_json/import/google_doc/html/list_parser.rb +46 -0
  42. data/lib/article_json/import/google_doc/html/node_analyzer.rb +111 -0
  43. data/lib/article_json/import/google_doc/html/paragraph_parser.rb +26 -0
  44. data/lib/article_json/import/google_doc/html/parser.rb +125 -0
  45. data/lib/article_json/import/google_doc/html/quote_parser.rb +46 -0
  46. data/lib/article_json/import/google_doc/html/shared/caption.rb +20 -0
  47. data/lib/article_json/import/google_doc/html/shared/float.rb +21 -0
  48. data/lib/article_json/import/google_doc/html/text_box_parser.rb +49 -0
  49. data/lib/article_json/import/google_doc/html/text_parser.rb +89 -0
  50. data/lib/article_json/utils/o_embed_resolver/base.rb +63 -0
  51. data/lib/article_json/utils/o_embed_resolver/facebook_video.rb +21 -0
  52. data/lib/article_json/utils/o_embed_resolver/slideshare.rb +22 -0
  53. data/lib/article_json/utils/o_embed_resolver/tweet.rb +23 -0
  54. data/lib/article_json/utils/o_embed_resolver/vimeo_video.rb +21 -0
  55. data/lib/article_json/utils/o_embed_resolver/youtube_video.rb +21 -0
  56. data/lib/article_json/utils.rb +11 -0
  57. data/lib/article_json/version.rb +3 -0
  58. data/lib/article_json.rb +55 -0
  59. metadata +189 -0
@@ -0,0 +1,89 @@
1
module ArticleJSON
  module Import
    module GoogleDoc
      module HTML
        # Turns a single inline node of a Google Doc HTML export into an
        # `ArticleJSON::Elements::Text`, exposing its plain content, its
        # bold/italic styling and an optional link target.
        class TextParser
          # @param [Nokogiri::HTML::Node] node
          # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
          def initialize(node:, css_analyzer:)
            @node = node
            @css_analyzer = css_analyzer
          end

          # The content of the text node, w/o any markup
          # @return [String]
          def content
            @node.inner_text
          end

          # Check if the text node is styled as bold
          # @return [Boolean]
          def bold?
            styled_span? && @css_analyzer.bold?(css_class)
          end

          # Check if the text node is styled as italic
          # @return [Boolean]
          def italic?
            styled_span? && @css_analyzer.italic?(css_class)
          end

          # A possible link target for the text, otherwise `nil`
          # Google redirects (basically all links in a google doc html export)
          # are stripped.
          # @return [String, nil]
          def href
            return unless @node.name == 'span'

            # Only the first element child is inspected, so look it up once.
            link = @node.first_element_child
            return unless link && link.name == 'a' && link.has_attribute?('href')

            strip_google_redirect(link.attribute('href').value)
          end

          # @return [ArticleJSON::Elements::Text]
          def element
            ArticleJSON::Elements::Text.new(
              content: content,
              bold: bold?,
              italic: italic?,
              href: href
            )
          end

          class << self
            # Extract multiple text nodes from a wrapping node
            # The wrapping node is usually a paragraph or caption
            # Child nodes that `NodeAnalyzer` reports as empty are skipped.
            # @param [Nokogiri::HTML::Node] node
            # @param [ArticleJSON::Import::GoogleDoc::HTML::CSSAnalyzer] css_analyzer
            # @return [Array<ArticleJSON::Elements::Text>]
            def extract(node:, css_analyzer:)
              node.children.each_with_object([]) do |child_node, texts|
                next if NodeAnalyzer.new(child_node).empty?
                texts << new(node: child_node, css_analyzer: css_analyzer).element
              end
            end
          end

          private

          # Whether the node is a `span` that carries a `class` attribute, i.e.
          # a node the CSS analyzer can be asked about.
          # @return [Boolean]
          def styled_span?
            @node.name == 'span' && @node.has_attribute?('class')
          end

          # @return [String]
          def css_class
            @node.attribute('class').value
          end

          # Unwrap a Google redirect link (`google.com/url?q=<target>`).
          # The URL is returned untouched when it is not such a redirect, when
          # it has no usable `q` parameter, or when it cannot be parsed at all;
          # an odd link should never abort the import.
          # @param [String] url
          # @return [String]
          def strip_google_redirect(url)
            uri = URI(url)
            return url unless uri.host && uri.host.match(/google\.com/)
            return url unless uri.path == '/url'

            # `uri.query` is nil for a redirect URL without a query string
            _key, target = URI.decode_www_form(uri.query.to_s).assoc('q')
            target.nil? || target.empty? ? url : target
          rescue URI::InvalidURIError, ArgumentError
            # Malformed URL or invalid percent-encoding: keep the original
            url
          end
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Common behaviour of all oembed resolvers: performs the HTTP request
      # against the URL a sub class provides via `oembed_url` and parses the
      # JSON response. When used directly, `Base` delegates to the sub class
      # matching the embed type of the element.
      class Base
        # @param [ArticleJSON::Elements::Embed] embed_element
        def initialize(embed_element)
          @element = embed_element
        end

        # The parsed oembed API response for the element.
        # Returns `nil` when called on `Base` itself and no resolver is
        # registered for the element's embed type.
        # @return [Hash, nil]
        def oembed_data
          resolver = self.class == Base ? self.class.build(@element) : self
          # `build` yields nil for unknown embed types
          resolver&.parsed_api_response
        end

        protected

        # Fetch and parse the oembed JSON; the result is memoized per instance
        # @return [Hash]
        def parsed_api_response
          @api_response ||= begin
            uri = URI.parse(oembed_url)
            http = Net::HTTP.new(uri.host, uri.port)
            http.use_ssl = (uri.scheme == 'https')
            response = http.request(Net::HTTP::Get.new(uri, http_headers))
            JSON.parse(response.body, symbolize_names: true)
          end
        end

        # Request headers; a `User-Agent` is only sent when one is configured
        # @return [Hash]
        def http_headers
          headers = { 'Content-Type' => 'application/json' }
          user_agent = ArticleJSON.configuration.oembed_user_agent
          headers['User-Agent'] = user_agent unless user_agent.nil?
          headers
        end

        class << self
          # Instantiate the correct sub class for a given element
          # @param [ArticleJSON::Elements::Embed] embed_element
          # @return [ArticleJSON::Utils::OEmbedResolver::Base, nil]
          def build(embed_element)
            resolver = resolver_by_embed_type(embed_element.embed_type)
            resolver.new(embed_element) unless resolver.nil?
          end

          # Lookup the correct sub class for a given element type
          # @param [Symbol, String, nil] type
          # @return [Class, nil] the resolver class, `nil` for unknown types
          def resolver_by_embed_type(type)
            {
              facebook_video: FacebookVideo,
              slideshare: Slideshare,
              tweet: Tweet,
              vimeo_video: VimeoVideo,
              youtube_video: YoutubeVideo,
            }[type&.to_sym]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Resolver for embedded Facebook videos
      class FacebookVideo < Base
        # Endpoint of the oembed API call for the element's video
        # @return [String]
        def oembed_url
          endpoint = 'https://www.facebook.com/plugins/video/oembed.json?url='
          endpoint + video_url
        end

        private

        # Address of the video, built from the element's embed id
        # @return [String]
        def video_url
          video_id = @element.embed_id
          "facebook.com/facebook/videos/#{video_id}"
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Resolver for embedded Slideshare slideshows
      class Slideshare < Base
        # Endpoint of the oembed API call for the element's slideshow
        # @return [String]
        def oembed_url
          endpoint = 'https://www.slideshare.net/api/oembed/2?format=json&url='
          endpoint + slide_url
        end

        private

        # Address of the slideshow; the embed id is split at its first slash
        # into the user handle and the slug of the slideshow
        # @return [String]
        def slide_url
          parts = @element.embed_id.split('/', 2)
          "www.slideshare.net/#{parts[0]}/#{parts[1]}"
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Resolver for embedded tweets
      class Tweet < Base
        # Endpoint of the oembed API call for the element's tweet
        # @return [String]
        def oembed_url
          endpoint = 'https://api.twitter.com/1/statuses/oembed.json?align=center'
          endpoint + '&url=' + tweet_url
        end

        private

        # Address of the tweet; the embed id is split at its first slash
        # into the user handle and the id of the tweet
        # @return [String]
        def tweet_url
          parts = @element.embed_id.split('/', 2)
          "https://twitter.com/#{parts[0]}/status/#{parts[1]}"
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Resolver for embedded Vimeo videos
      class VimeoVideo < Base
        # Endpoint of the oembed API call for the element's video
        # @return [String]
        def oembed_url
          endpoint = 'https://vimeo.com/api/oembed.json?url='
          endpoint + video_url
        end

        private

        # Address of the video, built from the element's embed id
        # @return [String]
        def video_url
          video_id = @element.embed_id
          "https://vimeo.com/#{video_id}"
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module ArticleJSON
  module Utils
    module OEmbedResolver
      # Resolver for embedded YouTube videos
      class YoutubeVideo < Base
        # The URL for the oembed API call
        # Uses HTTPS like every other resolver, so the request is not sent in
        # clear text.
        # @return [String]
        def oembed_url
          "https://www.youtube.com/oembed?format=json&url=#{video_url}"
        end

        private

        # The video URL of the element
        # @return [String]
        def video_url
          "youtube.com/watch?v=#{@element.embed_id}"
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module ArticleJSON
  # Namespace for helpers shared across the gem; its members are defined in
  # the files required below.
  module Utils
  end
end
5
+
6
+ require_relative 'utils/o_embed_resolver/base'
7
+ require_relative 'utils/o_embed_resolver/facebook_video'
8
+ require_relative 'utils/o_embed_resolver/slideshare'
9
+ require_relative 'utils/o_embed_resolver/tweet'
10
+ require_relative 'utils/o_embed_resolver/vimeo_video'
11
+ require_relative 'utils/o_embed_resolver/youtube_video'
@@ -0,0 +1,3 @@
1
module ArticleJSON
  # Version of the gem; frozen so the constant cannot be mutated in place
  VERSION = '0.1.0'.freeze
end
@@ -0,0 +1,55 @@
1
+ require 'uri'
2
+ require 'cgi'
3
+ require 'json'
4
+ require 'net/http'
5
+
6
+ require 'nokogiri'
7
+ require 'css_parser'
8
+
9
+ require_relative 'article_json/version'
10
+ require_relative 'article_json/configuration'
11
+ require_relative 'article_json/utils'
12
+
13
+ require_relative 'article_json/elements/base'
14
+ require_relative 'article_json/elements/text'
15
+ require_relative 'article_json/elements/heading'
16
+ require_relative 'article_json/elements/paragraph'
17
+ require_relative 'article_json/elements/list'
18
+ require_relative 'article_json/elements/image'
19
+ require_relative 'article_json/elements/text_box'
20
+ require_relative 'article_json/elements/quote'
21
+ require_relative 'article_json/elements/embed'
22
+
23
+ require_relative 'article_json/import/google_doc/html/shared/caption'
24
+ require_relative 'article_json/import/google_doc/html/shared/float'
25
+ require_relative 'article_json/import/google_doc/html/css_analyzer'
26
+ require_relative 'article_json/import/google_doc/html/node_analyzer'
27
+ require_relative 'article_json/import/google_doc/html/text_parser'
28
+ require_relative 'article_json/import/google_doc/html/heading_parser'
29
+ require_relative 'article_json/import/google_doc/html/paragraph_parser'
30
+ require_relative 'article_json/import/google_doc/html/list_parser'
31
+ require_relative 'article_json/import/google_doc/html/image_parser'
32
+ require_relative 'article_json/import/google_doc/html/text_box_parser'
33
+ require_relative 'article_json/import/google_doc/html/quote_parser'
34
+ require_relative 'article_json/import/google_doc/html/embedded_parser'
35
+ require_relative 'article_json/import/google_doc/html/embedded_facebook_video_parser'
36
+ require_relative 'article_json/import/google_doc/html/embedded_vimeo_video_parser'
37
+ require_relative 'article_json/import/google_doc/html/embedded_youtube_video_parser'
38
+ require_relative 'article_json/import/google_doc/html/embedded_slideshare_parser'
39
+ require_relative 'article_json/import/google_doc/html/embedded_tweet_parser'
40
+ require_relative 'article_json/import/google_doc/html/parser'
41
+
42
+ require_relative 'article_json/export/html/elements/shared/caption'
43
+ require_relative 'article_json/export/html/elements/shared/float'
44
+ require_relative 'article_json/export/html/elements/base'
45
+ require_relative 'article_json/export/html/elements/text'
46
+ require_relative 'article_json/export/html/elements/heading'
47
+ require_relative 'article_json/export/html/elements/paragraph'
48
+ require_relative 'article_json/export/html/elements/list'
49
+ require_relative 'article_json/export/html/elements/image'
50
+ require_relative 'article_json/export/html/elements/text_box'
51
+ require_relative 'article_json/export/html/elements/quote'
52
+ require_relative 'article_json/export/html/elements/embed'
53
+ require_relative 'article_json/export/html/exporter'
54
+
55
+ require_relative 'article_json/article'
metadata ADDED
@@ -0,0 +1,189 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: article_json
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - "@dsager"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-09-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: css_parser
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.5'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.15'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.15'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.6'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.6'
69
+ - !ruby/object:Gem::Dependency
70
+ name: webmock
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: coveralls
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.8'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.8'
97
+ description: |
98
+ `article_json` is a format definition for news articles and a ruby gem that
99
+ offers conversions from and to different formats:
100
+ - Parser for Google Doc HTML exports
101
+ - Converter to simple HTML format
102
+ - Converter to AMP format
103
+ email: info@devex.com
104
+ executables: []
105
+ extensions: []
106
+ extra_rdoc_files: []
107
+ files:
108
+ - CHANGELOG.md
109
+ - LICENSE
110
+ - README.md
111
+ - bin/article_json_export_google_doc.rb
112
+ - bin/article_json_export_html.rb
113
+ - bin/article_json_parse_google_doc.rb
114
+ - bin/update_reference_document.sh
115
+ - lib/article_json.rb
116
+ - lib/article_json/article.rb
117
+ - lib/article_json/configuration.rb
118
+ - lib/article_json/elements/base.rb
119
+ - lib/article_json/elements/embed.rb
120
+ - lib/article_json/elements/heading.rb
121
+ - lib/article_json/elements/image.rb
122
+ - lib/article_json/elements/list.rb
123
+ - lib/article_json/elements/paragraph.rb
124
+ - lib/article_json/elements/quote.rb
125
+ - lib/article_json/elements/text.rb
126
+ - lib/article_json/elements/text_box.rb
127
+ - lib/article_json/export/html/elements/base.rb
128
+ - lib/article_json/export/html/elements/embed.rb
129
+ - lib/article_json/export/html/elements/heading.rb
130
+ - lib/article_json/export/html/elements/image.rb
131
+ - lib/article_json/export/html/elements/list.rb
132
+ - lib/article_json/export/html/elements/paragraph.rb
133
+ - lib/article_json/export/html/elements/quote.rb
134
+ - lib/article_json/export/html/elements/shared/caption.rb
135
+ - lib/article_json/export/html/elements/shared/float.rb
136
+ - lib/article_json/export/html/elements/text.rb
137
+ - lib/article_json/export/html/elements/text_box.rb
138
+ - lib/article_json/export/html/exporter.rb
139
+ - lib/article_json/import/google_doc/html/css_analyzer.rb
140
+ - lib/article_json/import/google_doc/html/embedded_facebook_video_parser.rb
141
+ - lib/article_json/import/google_doc/html/embedded_parser.rb
142
+ - lib/article_json/import/google_doc/html/embedded_slideshare_parser.rb
143
+ - lib/article_json/import/google_doc/html/embedded_tweet_parser.rb
144
+ - lib/article_json/import/google_doc/html/embedded_vimeo_video_parser.rb
145
+ - lib/article_json/import/google_doc/html/embedded_youtube_video_parser.rb
146
+ - lib/article_json/import/google_doc/html/heading_parser.rb
147
+ - lib/article_json/import/google_doc/html/image_parser.rb
148
+ - lib/article_json/import/google_doc/html/list_parser.rb
149
+ - lib/article_json/import/google_doc/html/node_analyzer.rb
150
+ - lib/article_json/import/google_doc/html/paragraph_parser.rb
151
+ - lib/article_json/import/google_doc/html/parser.rb
152
+ - lib/article_json/import/google_doc/html/quote_parser.rb
153
+ - lib/article_json/import/google_doc/html/shared/caption.rb
154
+ - lib/article_json/import/google_doc/html/shared/float.rb
155
+ - lib/article_json/import/google_doc/html/text_box_parser.rb
156
+ - lib/article_json/import/google_doc/html/text_parser.rb
157
+ - lib/article_json/utils.rb
158
+ - lib/article_json/utils/o_embed_resolver/base.rb
159
+ - lib/article_json/utils/o_embed_resolver/facebook_video.rb
160
+ - lib/article_json/utils/o_embed_resolver/slideshare.rb
161
+ - lib/article_json/utils/o_embed_resolver/tweet.rb
162
+ - lib/article_json/utils/o_embed_resolver/vimeo_video.rb
163
+ - lib/article_json/utils/o_embed_resolver/youtube_video.rb
164
+ - lib/article_json/version.rb
165
+ homepage: https://github.com/Devex/article_json
166
+ licenses:
167
+ - MIT
168
+ metadata: {}
169
+ post_install_message:
170
+ rdoc_options: []
171
+ require_paths:
172
+ - lib
173
+ required_ruby_version: !ruby/object:Gem::Requirement
174
+ requirements:
175
+ - - ">="
176
+ - !ruby/object:Gem::Version
177
+ version: '2.3'
178
+ required_rubygems_version: !ruby/object:Gem::Requirement
179
+ requirements:
180
+ - - ">="
181
+ - !ruby/object:Gem::Version
182
+ version: '0'
183
+ requirements: []
184
+ rubyforge_project:
185
+ rubygems_version: 2.5.1
186
+ signing_key:
187
+ specification_version: 4
188
+ summary: JSON Format for News Articles & Ruby Gem
189
+ test_files: []