feed_processor_utils 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ :attributes:
2
+ :title:
3
+ :keys:
4
+ - :atom_title
5
+ - :title
6
+ :description:
7
+ :keys:
8
+ - :meta_desc
9
+ :pics:
10
+ :keys:
11
+ - :image
12
+ - :og_image
13
+ :time:
14
+ :keys:
15
+ - :atom_updated
16
+ - :atom_published
17
+ - :date
18
+ :helper: :time_with_default
19
# NOTE(review): unlike every other :keys: list in this file, the two entries
# below have no leading colon, so YAML loads them as the strings "atom_link"
# and "link" rather than the symbols :atom_link / :link. Confirm that the
# consumer of this config really expects strings for the :url: attribute.
+ :url:
20
+ :keys:
21
+ - atom_link
22
+ - link
23
+ :text:
24
+ :keys:
25
+ - :atom_text
26
+ - :content
27
+ :helper: :clean_text
28
+ :replacements:
29
+ - - !ruby/regexp /\<a.+href.+data:.+base64,.+\<\/a\>/
30
+ - ''
31
+ - - !ruby/regexp /\<script.+\<\/script\>/
32
+ - ''
33
+ - - !ruby/regexp /<img\/?[^>]+?>/
34
+ - ''
35
+ - - !ruby/regexp /(<\s*p\s*\w*>\s*&nbsp;\s*<\s*\/p\s*>)+/
36
+ - <br />
37
+ - - !ruby/regexp /(\s*\n\s*)+/
38
+ - <br />
39
+ - - !ruby/regexp /(\<\/?\s*br\s*\/?\>)+/
40
+ - <br />
41
+ - - !ruby/regexp /<\s*\/\s*p\s*><br \/>/
42
+ - </p>
43
+ - - !ruby/regexp /<a /
44
+ - ! '<a rel="nofollow" '
@@ -0,0 +1,51 @@
1
+ :real_title:
2
+ :selectors:
3
+ - title
4
+ :fallback_text: true
5
+ :page_title:
6
+ :selectors:
7
+ - meta[name='og:title']
8
+ - meta[property='og:title']
9
+ - title
10
+ :attribute: content
11
+ :fallback_text: true
12
+ :meta_desc:
13
+ :selectors:
14
+ - meta[name='og:description']
15
+ - meta[property='og:description']
16
+ - meta[name='description']
17
+ - meta[name='Description']
18
+ :attribute: content
19
+ :video_url:
20
+ :collection: true
21
+ :selectors:
22
+ - embed
23
+ - iframe
24
+ :attribute: src
25
+ :video_site:
26
+ :selectors:
27
+ - .video_holder img.video_thumbnail
28
+ :attribute: video-site
29
+ :video_id:
30
+ :selectors:
31
+ - .video_holder img.video_thumbnail
32
+ :attribute: data-video
33
+ :image:
34
+ :selectors:
35
+ - img.entry-image
36
+ :attribute: src
37
+ :lazy_image_tags:
38
+ :collection: true
39
+ :selectors:
40
+ - img
41
+ :attribute: data-src
42
+ :image_tags:
43
+ :collection: true
44
+ :selectors:
45
+ - img
46
+ :attribute: src
47
+ :og_image:
48
+ :attribute: content
49
+ :selectors:
50
+ - meta[property='og:image']
51
+
@@ -0,0 +1,115 @@
1
require 'base64'
require 'open-uri'
require 'uri'
require 'yaml'

require 'newer_image_size'
require 'image_size'
3
+
4
module FeedProcessorUtils
  # Stateless helpers used while building a post from parsed feed item data:
  # text sanitising, URL normalisation and selection of images and videos.
  #
  # The sanitising rules are read from the +:replacements+ entry of
  # config/feed_post_builder.yml (a list of [pattern, replacement] pairs) and
  # are loaded once, on first use.
  class FeedPostBuilder

    @@config_file = File.join(File.dirname(__FILE__), 'config/feed_post_builder.yml')

    # Applies every configured [pattern, replacement] pair to +text+, in order.
    #
    # A mutable string is still modified in place (as before); a frozen string
    # is copied first instead of raising.
    #
    # @param text [String, nil]
    # @return [String, nil] the sanitised text, or nil when +text+ is nil
    def self.sanitize(text)
      return nil unless text
      text = text.dup if text.frozen?
      replacements.each do |pattern, replacement|
        text.gsub!(pattern, replacement)
      end
      text
    end

    # Turns a site-relative URL ("/img/a.png") into an absolute one.
    #
    # @param url [String] URL to normalise
    # @param host [String] bare host ("example.com") or origin including the
    #   scheme ("https://example.com"); "http://" is assumed when no scheme is given
    # @return [String] +url+ unchanged when it is already absolute (or when no
    #   host is available), otherwise the absolute URL
    def self.ensure_absolute(url, host)
      url = url.to_s
      # Protocol-relative URLs already carry their own host; only the scheme is missing.
      return "http:#{url}" if url.start_with?('//')
      return url unless url.start_with?('/')

      host = host.to_s
      return url if host.empty?
      host = "http://#{host}" unless host =~ %r{\A[a-z][a-z0-9+.-]*://}i
      host.chomp('/') + url
    end

    # @param uri [String]
    # @return [MatchData, nil] match of the "data:<type>/<subtype>;base64,"
    #   prefix, or nil when +uri+ is not a base64 data URI
    def self.match_based64?(uri)
      /data:\w+\/\w+;base64,/.match(uri.to_s)
    end

    # Returns the argument containing the most words once HTML tags and
    # "&nbsp;" entities are replaced by spaces. The arguments are not modified.
    #
    # @param strings [Array<#to_s>]
    # @return [Object, nil] the winning argument, or nil when called without arguments
    def self.longest_content(*strings)
      strings.sort_by { |string| word_count(string) }.last
    end

    # Determines the pixel dimensions of an image given as a base64 data URI
    # or as a remote URL.
    #
    # Remote images are fetched through open-uri on a parsed URI instead of
    # Kernel#open, so a crafted value such as "| cmd" can never be executed
    # and local files are never read.
    #
    # @param uri [String]
    # @return [Array] whatever the image-size library reports, or [0, 0] when
    #   the image cannot be fetched
    def self.get_image_dimensions(uri)
      match = match_based64?(uri)
      if match
        encoded = uri.split(match[0])[1].to_s
        return ImageSize.new(Base64.decode64(encoded)).get_size
      end

      begin
        remote = URI.parse(uri.to_s)
        # Only the URI classes extended by open-uri expose a public #open.
        return [0, 0] unless remote.respond_to?(:open)
        file = remote.open
      rescue StandardError
        return [0, 0]
      end

      begin
        data = file.read
        if data.encoding.name == "UTF-8"
          # The stream was consumed by #read above; rewind so the reader
          # starts at the beginning even if it does not rewind by itself.
          file.rewind
          NewerImageSize.new(file).size
        else
          ImageSize.new(data).get_size
        end
      ensure
        file.close
      end
    end

    # @param dimensions [Array, nil] [width, height]
    # @return [Boolean] true when width/height lies within 0.5..2.5 and the
    #   image satisfies {have_minimum_size?}
    def self.dimensions_ok?(dimensions)
      dims = Array(dimensions)
      return false if dims.last.to_i == 0

      (0.5..2.5).cover?(dims.first.to_f / dims.last.to_f) && have_minimum_size?(dims)
    end

    # @param dimensions [Array, nil] [width, height]
    # @return [Boolean] true when the image is at least 300 wide and 150 high
    def self.have_minimum_size?(dimensions)
      dims = Array(dimensions)
      dims.first.to_i >= 300 && dims.last.to_i >= 150
    end

    # Collects candidate image URLs for an item.
    #
    # 1. The first of +:og_image+ / +:image+ that meets the minimum size.
    # 2. The largest image among +:lazy_image_tags+ (or +:images_in_text+
    #    when there are no lazy images): appended for news items, otherwise
    #    prepended when its dimensions are acceptable.
    #
    # @param item_data [Hash] parsed item data
    # @param is_news [Boolean]
    # @return [Array<String>]
    def self.get_images(item_data, is_news)
      image_urls = []
      origin = site_origin(item_data)

      [:og_image, :image].each do |key|
        url = item_data[key]
        next unless url
        url = ensure_absolute(url.to_s, origin)
        if have_minimum_size?(get_image_dimensions(url))
          image_urls << url
          break
        end
      end

      lazy = item_data[:lazy_image_tags]
      nominated_images = (lazy.nil? || lazy.empty?) ? item_data[:images_in_text] : lazy
      with_size = Array(nominated_images).map do |url|
        {url: url, dim: get_image_dimensions(url)}
      end
      largest_img = with_size.max_by { |img| image_area(img[:dim]) }

      if largest_img
        if is_news
          image_urls << largest_img[:url]
        elsif dimensions_ok?(largest_img[:dim])
          image_urls.unshift(largest_img[:url])
        end
      end
      image_urls
    end

    # @param item_data [Hash] parsed item data
    # @return [Array] the +:videos_in_text+ URLs, with "http:" prepended to
    #   protocol-relative ones; empty when the item has no videos
    def self.get_videos(item_data)
      Array(item_data[:videos_in_text]).map do |video|
        # sometimes the protocol is omitted from video url
        video.to_s.start_with?('//') ? "http:" + video : video
      end
    end

    # @return [Hash] the parsed builder configuration (memoised)
    def self.config
      @@config ||= load_yaml(File.read(@@config_file))
    end

    # @return [Array<Array>] configured [pattern, replacement] pairs
    def self.replacements
      config[:replacements] || []
    end

    # Number of whitespace-separated words left after tags and "&nbsp;"
    # entities are replaced by spaces.
    def self.word_count(string)
      string.to_s.gsub(/&nbsp;/, ' ').gsub(/<\/?[^<>]+>/, ' ').split(' ').size
    end
    private_class_method :word_count

    # Pixel area of a [width, height] pair; missing values count as 0.
    def self.image_area(dim)
      width, height = Array(dim)
      width.to_i * height.to_i
    end
    private_class_method :image_area

    # "scheme://host[:port]" of the item's own URL (+:url+, else +:id+), or
    # nil when it cannot be determined.
    def self.site_origin(item_data)
      page = URI.parse((item_data[:url] || item_data[:id]).to_s)
      return nil unless page.host

      origin = "#{page.scheme || 'http'}://#{page.host}"
      origin += ":#{page.port}" if page.port && page.port != page.default_port
      origin
    rescue URI::InvalidURIError
      nil
    end
    private_class_method :site_origin

    # Parses the gem's own config, which relies on Symbol keys and
    # !ruby/regexp values. YAML.load rejects those since Psych 4, hence
    # unsafe_load where available.
    # SECURITY: only ever feed this trusted, bundled configuration.
    def self.load_yaml(raw)
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(raw) : YAML.load(raw)
    end
    private_class_method :load_yaml

  end
end
@@ -0,0 +1,61 @@
1
require 'open-uri'
require 'uri'
require 'yaml'

module FeedProcessorUtils

  # Extracts named fields from an HTML document.
  #
  # The fields are described by a YAML file mapping each field name to:
  #   :selectors     - CSS selectors, tried in order
  #   :attribute     - attribute to read from a matching element (optional)
  #   :fallback_text - when true, use the element's text if the attribute is absent
  #   :collection    - when true, collect the values of all matches instead
  #                    of returning the first one
  class HTMLParser

    @@default_config = File.join(File.dirname(__FILE__), "config/html_parser.yml")

    # @param config_file [String, nil] path to a field-definition YAML file;
    #   defaults to the config bundled with the gem
    def initialize(config_file = nil)
      config_file ||= @@default_config
      @config = load_config(config_file)
    end

    # Parses an HTML string with Nokogiri and extracts every configured field.
    #
    # @param input [String] HTML source
    # @return [Hash] field name => extracted value (String, Array for
    #   collections, or nil when nothing matched)
    def parse_data(input)
      input_doc = Nokogiri::HTML(input)
      fields.each_with_object({}) do |(field_name, parsing_data), result|
        result[field_name] = extract_field(input_doc, parsing_data)
      end
    end

    # Downloads +url+ and extracts every configured field from the response body.
    #
    # The URL is opened through open-uri on a parsed URI rather than through
    # Kernel#open, so a value such as "| cmd" can never be executed, local
    # files are never read, and the response stream is closed after reading.
    #
    # @param url [String] URL of a scheme supported by open-uri
    # @return [Hash] see {#parse_data}
    # @raise [ArgumentError] when +url+ is not a URL open-uri can fetch
    def parse_url(url)
      uri = URI.parse(url.to_s)
      # Only the URI classes extended by open-uri expose a public #open.
      raise ArgumentError, "unsupported URL: #{url.inspect}" unless uri.respond_to?(:open)

      parse_data(uri.open { |io| io.read })
    end

    private

    # Extracts one field from the document according to its definition.
    #
    # @param input_doc [#css, #at_css] parsed document
    # @param parsing_data [Hash] field definition (see class documentation)
    # @return [Array, String, nil] all values for a collection field,
    #   otherwise the first value found or nil
    def extract_field(input_doc, parsing_data)
      selectors = Array(parsing_data[:selectors])
      attribute = parsing_data[:attribute]
      use_text  = parsing_data[:fallback_text]

      if parsing_data[:collection]
        selectors.each_with_object([]) do |selector, collection|
          input_doc.css(selector).each do |element|
            value = element_value(element, attribute, use_text)
            collection << value unless value.nil?
          end
        end
      else
        selectors.each do |selector|
          element = input_doc.at_css(selector)
          next unless element
          value = element_value(element, attribute, use_text)
          return value unless value.nil?
        end
        nil
      end
    end

    # Value of +attribute+ on +element+ when present, otherwise the element's
    # text if text fallback is enabled, otherwise nil.
    def element_value(element, attribute, use_text)
      value = attribute && element[attribute]
      return value if value
      element.text if use_text
    end

    # Field definitions loaded from the config file ({} for an empty file).
    def fields
      @config || {}
    end

    # Reads the YAML field definitions. They use Symbol keys, which YAML.load
    # rejects since Psych 4, hence unsafe_load where available.
    # SECURITY: config files must come from a trusted source.
    def load_config(path)
      raw = File.read(path)
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(raw) : YAML.load(raw)
    end

  end
end
@@ -0,0 +1,2 @@
1
+ require 'feed_processor_utils/feed_post_builder'
2
+ require 'feed_processor_utils/html_parser'
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed_processor_utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - FTBpro
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: imagesize
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: newer_image_size
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: utility classes to work with feeds
63
+ email: gashaw@ftbpro.com
64
+ executables: []
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - lib/feed_processor_utils.rb
69
+ - lib/feed_processor_utils/feed_post_builder.rb
70
+ - lib/feed_processor_utils/html_parser.rb
71
+ - lib/feed_processor_utils/config/html_parser.yml
72
+ - lib/feed_processor_utils/config/feed_post_builder.yml
73
+ homepage: http://rubygems.org/gems/feed_processor_utils
74
+ licenses:
75
+ - MIT
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ! '>='
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ requirements: []
93
+ rubyforge_project:
94
+ rubygems_version: 1.8.24
95
+ signing_key:
96
+ specification_version: 3
97
+ summary: Feed Processing toolbox
98
+ test_files: []