feed_processor_utils 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,44 @@
1
+ :attributes:
2
+ :title:
3
+ :keys:
4
+ - :atom_title
5
+ - :title
6
+ :description:
7
+ :keys:
8
+ - :meta_desc
9
+ :pics:
10
+ :keys:
11
+ - :image
12
+ - :og_image
13
+ :time:
14
+ :keys:
15
+ - :atom_updated
16
+ - :atom_published
17
+ - :date
18
+ :helper: :time_with_default
19
+ :url:
20
+ :keys:
21
+ - atom_link
22
+ - link
23
+ :text:
24
+ :keys:
25
+ - :atom_text
26
+ - :content
27
+ :helper: :clean_text
28
+ :replacements:
29
+ - - !ruby/regexp /\<a.+href.+data:.+base64,.+\<\/a\>/
30
+ - ''
31
+ - - !ruby/regexp /\<script.+\<\/script\>/
32
+ - ''
33
+ - - !ruby/regexp /<img\/?[^>]+?>/
34
+ - ''
35
+ - - !ruby/regexp /(<\s*p\s*\w*>\s*&nbsp;\s*<\s*\/p\s*>)+/
36
+ - <br />
37
+ - - !ruby/regexp /(\s*\n\s*)+/
38
+ - <br />
39
+ - - !ruby/regexp /(\<\/?\s*br\s*\/?\>)+/
40
+ - <br />
41
+ - - !ruby/regexp /<\s*\/\s*p\s*><br \/>/
42
+ - </p>
43
+ - - !ruby/regexp /<a /
44
+ - ! '<a rel="nofollow" '
@@ -0,0 +1,51 @@
1
+ :real_title:
2
+ :selectors:
3
+ - title
4
+ :fallback_text: true
5
+ :page_title:
6
+ :selectors:
7
+ - meta[name='og:title']
8
+ - meta[property='og:title']
9
+ - title
10
+ :attribute: content
11
+ :fallback_text: true
12
+ :meta_desc:
13
+ :selectors:
14
+ - meta[name='og:description']
15
+ - meta[property='og:description']
16
+ - meta[name='description']
17
+ - meta[name='Description']
18
+ :attribute: content
19
+ :video_url:
20
+ :collection: true
21
+ :selectors:
22
+ - embed
23
+ - iframe
24
+ :attribute: src
25
+ :video_site:
26
+ :selectors:
27
+ - .video_holder img.video_thumbnail
28
+ :attribute: video-site
29
+ :video_id:
30
+ :selectors:
31
+ - .video_holder img.video_thumbnail
32
+ :attribute: data-video
33
+ :image:
34
+ :selectors:
35
+ - img.entry-image
36
+ :attribute: src
37
+ :lazy_image_tags:
38
+ :collection: true
39
+ :selectors:
40
+ - img
41
+ :attribute: data-src
42
+ :image_tags:
43
+ :collection: true
44
+ :selectors:
45
+ - img
46
+ :attribute: src
47
+ :og_image:
48
+ :attribute: content
49
+ :selectors:
50
+ - meta[property='og:image']
51
+
@@ -0,0 +1,115 @@
1
require 'open-uri'
require 'uri'
require 'yaml'

require 'image_size'
require 'newer_image_size'
3
+
4
module FeedProcessorUtils
  # Class-level helpers used while building a post from already-extracted feed
  # item data: markup clean-up, URL normalisation and image/video selection.
  # All methods are stateless apart from the memoised configuration.
  class FeedPostBuilder
    # YAML file holding the :replacements rules, shipped next to this file.
    CONFIG_FILE = File.join(File.dirname(__FILE__), 'config/feed_post_builder.yml')

    # Matches a leading "scheme://" and captures the scheme name.
    SCHEME_PREFIX = %r{\A([a-z][a-z0-9+\-.]*)://}i

    # Applies every configured [pattern, replacement] pair to +text+, in order.
    #
    # The substitutions are done in place, as before, so callers that rely on
    # their string being updated keep working; a frozen string is duplicated
    # first instead of raising.
    #
    # @param text [String, nil]
    # @return [String, nil] the cleaned text, or nil when +text+ is nil
    def self.sanitize(text)
      return nil unless text

      text = text.dup if text.frozen?
      replacements.each { |pattern, replacement| text.gsub!(pattern, replacement) }
      text
    end

    # Turns a root-relative ("/img.png") or protocol-relative ("//cdn/img.png")
    # URL into an absolute one; any other URL is returned untouched.
    #
    # @param url [String] the URL to normalise
    # @param host [String] either a bare host ("example.com", prefixed with
    #   "http://" as before) or a full page URL, in which case only its
    #   scheme, host and non-default port are used
    # @return [String]
    def self.ensure_absolute(url, host)
      return url unless url.start_with?('/')

      scheme = host.to_s[SCHEME_PREFIX, 1] || 'http'
      return "#{scheme}:#{url}" if url.start_with?('//')

      "#{site_root(host.to_s)}#{url}"
    end

    # @param uri [String, nil]
    # @return [MatchData, nil] the match of the "data:<type>/<subtype>;base64,"
    #   prefix, or nil when +uri+ does not contain one
    def self.match_based64?(uri)
      /data:\w+\/\w+;base64,/.match(uri)
    end

    # Returns the argument with the most words once HTML tags and "&nbsp;"
    # entities are treated as whitespace. Non-strings are compared through
    # +to_s+. With no arguments the result is nil. On a tie the earliest
    # argument wins (the former sort_by-based version left ties unspecified).
    #
    # NOTE(review): the original chain also contained a space-for-space gsub,
    # possibly a non-breaking space lost in transit; as written it was a no-op
    # and has been dropped. Confirm against the upstream file.
    def self.longest_content(*strings)
      strings.max_by { |string| word_count(string) }
    end

    # Fetches the image behind +uri+ and returns what the image-size libraries
    # report for it.
    #
    # * data URIs are decoded locally;
    # * "scheme://" locations are fetched through open-uri;
    # * anything else is opened as a local path, without pipe execution;
    # * any failure to open the resource yields [0, 0].
    #
    # @param uri [String, nil]
    # @return [Array] presumably [width, height]; the exact shape comes from
    #   ImageSize#get_size / NewerImageSize#size, which are not visible here
    def self.get_image_dimensions(uri)
      match = match_based64?(uri)
      if match
        # unpack('m') is what Base64.decode64 does, without needing 'base64'.
        return ImageSize.new(match.post_match.unpack('m').first).get_size
      end

      begin
        file = open_resource(uri)
      rescue StandardError
        return [0, 0]
      end

      begin
        data = file.read
        # Kept from the original: UTF-8 tagged payloads go to NewerImageSize
        # with the IO itself, everything else to ImageSize with the raw bytes.
        if data.encoding.name == 'UTF-8'
          NewerImageSize.new(file).size
        else
          ImageSize.new(data).get_size
        end
      ensure
        file.close unless file.closed?
      end
    end

    # @param dimensions [Array] first element is the width, last the height
    # @return [Boolean] true when width/height lies within 0.5..2.5 and the
    #   image passes {have_minimum_size?}; false for a zero or missing height
    def self.dimensions_ok?(dimensions)
      return false if dimensions.last.to_i.zero?

      ratio = dimensions.first.to_f / dimensions.last.to_f
      (0.5..2.5).cover?(ratio) && have_minimum_size?(dimensions)
    end

    # @return [Boolean] true when the image is at least 300 wide and 150 high
    def self.have_minimum_size?(dimensions)
      dimensions.first.to_i >= 300 && dimensions.last.to_i >= 150
    end

    # Collects candidate image URLs for an item.
    #
    # 1. The first of :og_image / :image that is big enough is taken.
    # 2. The largest image among :lazy_image_tags (or :images_in_text when
    #    there are no lazy ones) is appended for news items; for other items
    #    it is put first, but only if its dimensions are acceptable.
    #
    # @param item_data [Hash] reads :url, :id, :og_image, :image,
    #   :lazy_image_tags and :images_in_text
    # @param is_news [Boolean]
    # @return [Array<String>]
    def self.get_images(item_data, is_news)
      image_urls = []
      page_url = URI.parse(item_data[:url] || item_data[:id]).to_s

      [:og_image, :image].each do |key|
        candidate = item_data[key]
        next unless candidate

        candidate = ensure_absolute(candidate.to_s, page_url)
        next unless have_minimum_size?(get_image_dimensions(candidate))

        image_urls << candidate
        break
      end

      # Plain-Ruby emptiness checks: ActiveSupport's #present? is not a
      # dependency of this gem, and either list may be missing altogether.
      lazy_images = Array(item_data[:lazy_image_tags])
      nominated_images = lazy_images.empty? ? Array(item_data[:images_in_text]) : lazy_images

      largest_img = nominated_images
                    .map { |url| { url: url, dim: get_image_dimensions(url) } }
                    .max_by { |img| pixel_area(img[:dim]) }

      if largest_img
        if is_news
          image_urls << largest_img[:url]
        elsif largest_img[:dim] && dimensions_ok?(largest_img[:dim])
          image_urls.unshift(largest_img[:url])
        end
      end
      image_urls
    end

    # @param item_data [Hash] reads :videos_in_text
    # @return [Array<String>] the video URLs; an empty array when none given
    def self.get_videos(item_data)
      Array(item_data[:videos_in_text]).map do |video|
        # sometimes the protocol is omitted from video url
        video.start_with?('//') ? "http:#{video}" : video
      end
    end

    # @return [Hash] the parsed configuration, loaded once and memoised
    def self.config
      @config ||= load_config(CONFIG_FILE)
    end

    # @return [Array] the [pattern, replacement] pairs used by {sanitize}
    def self.replacements
      config[:replacements]
    end

    # Number of whitespace-separated words left after stripping markup.
    def self.word_count(string)
      string.to_s.gsub(/&nbsp;/, ' ').gsub(/<\/?[^<>]+>/, ' ').split(' ').size
    end

    # Reduces a page URL to "scheme://host[:port]"; bare hosts get "http://".
    def self.site_root(host)
      return "http://#{host}" unless host =~ SCHEME_PREFIX

      parsed = URI.parse(host)
      return host unless parsed.host

      port = parsed.port && parsed.port != parsed.default_port ? ":#{parsed.port}" : ''
      "#{parsed.scheme}://#{parsed.host}#{port}"
    rescue URI::InvalidURIError
      host
    end

    # Opens a URL through open-uri or a plain path through File.open. Unlike
    # Kernel#open this never spawns a process for "|command" strings.
    def self.open_resource(location)
      if location.to_s =~ SCHEME_PREFIX
        URI.parse(location).open
      else
        File.open(location)
      end
    end

    # Area used to rank images; missing or nil dimensions count as zero.
    def self.pixel_area(dim)
      dim ? dim[0].to_i * dim[1].to_i : 0
    end

    # Loads the bundled YAML. The file uses Symbol keys and !ruby/regexp
    # values, which Psych 4's YAML.load rejects, so they are permitted
    # explicitly where that API exists.
    def self.load_config(path)
      source = File.read(path)
      if YAML.respond_to?(:unsafe_load)
        YAML.safe_load(source, permitted_classes: [Symbol, Regexp])
      else
        YAML.load(source)
      end
    end

    private_class_method :word_count, :site_root, :open_resource, :pixel_area, :load_config
  end
end
@@ -0,0 +1,61 @@
1
+ require 'open-uri'
2
+
3
# YAML is used to read the field configuration but was never required here.
require 'yaml'

module FeedProcessorUtils
  # Extracts a fixed set of named fields from an HTML document, driven by a
  # YAML description of CSS selectors.
  #
  # Each configured field is a hash with:
  #   :selectors     - CSS selectors, tried in order
  #   :attribute     - attribute whose value is wanted (optional)
  #   :fallback_text - when true, use the element text if the attribute is absent
  #   :collection    - when true, gather every match instead of the first one
  class HTMLParser
    # Field configuration used when no file is given to the constructor.
    DEFAULT_CONFIG = File.join(File.dirname(__FILE__), 'config/html_parser.yml')

    # Strings starting with "scheme://" are fetched through open-uri.
    URL_SCHEME = %r{\A[a-z][a-z0-9+\-.]*://}i

    # @param config_file [String, nil] path to a YAML field configuration;
    #   the bundled one is used when nil
    def initialize(config_file = nil)
      @config = load_config(config_file || DEFAULT_CONFIG)
    end

    # @param input [String] HTML source
    # @return [Hash] field name => extracted value (an Array for collection
    #   fields, otherwise a String or nil)
    def parse_data(input)
      document = Nokogiri::HTML(input)
      fields.each_with_object({}) do |(field_name, parsing_data), parsed|
        parsed[field_name] = extract_field(document, parsing_data)
      end
    end

    # Reads the document at +url+ and parses it like {#parse_data}.
    #
    # @param url [String] a "scheme://" URL (fetched through open-uri) or, as
    #   before, a local path
    # @return [Hash]
    def parse_url(url)
      parse_data(read_source(url))
    end

    private

    # Applies one field rule to the document. Collection rules return every
    # value found across all selectors; scalar rules return the first value
    # found, or nil.
    def extract_field(input_doc, parsing_data)
      selectors = parsing_data[:selectors]

      if parsing_data[:collection]
        return selectors.flat_map do |selector|
          input_doc.css(selector).map { |element| field_value(element, parsing_data) }
        end.compact
      end

      selectors.each do |selector|
        element = input_doc.at_css(selector)
        value = element && field_value(element, parsing_data)
        return value if value
      end
      nil
    end

    # The configured attribute of +element+ when it is set, else the element
    # text if the rule allows falling back to it, else nil.
    def field_value(element, parsing_data)
      attribute = parsing_data[:attribute]
      value = attribute && element[attribute]
      return value if value

      element.text if parsing_data[:fallback_text]
    end

    # Reads a URL or local file and closes the handle afterwards. Kernel#open
    # is avoided: it runs "|command" strings and no longer opens URLs on
    # Ruby 3.
    def read_source(url)
      if url.to_s =~ URL_SCHEME
        URI.parse(url).open(&:read)
      else
        File.open(url, &:read)
      end
    end

    # The configuration uses Symbol keys, which Psych 4's YAML.load rejects,
    # so Symbol is permitted explicitly where that API exists.
    def load_config(path)
      source = File.read(path)
      if YAML.respond_to?(:unsafe_load)
        YAML.safe_load(source, permitted_classes: [Symbol])
      else
        YAML.load(source)
      end
    end

    def fields
      @config
    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'feed_processor_utils/feed_post_builder'
2
+ require 'feed_processor_utils/html_parser'
metadata ADDED
@@ -0,0 +1,98 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: feed_processor_utils
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - FTBpro
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-27 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: imagesize
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: newer_image_size
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: utility classes to work with feeds
63
+ email: gashaw@ftbpro.com
64
+ executables: []
65
+ extensions: []
66
+ extra_rdoc_files: []
67
+ files:
68
+ - lib/feed_processor_utils.rb
69
+ - lib/feed_processor_utils/feed_post_builder.rb
70
+ - lib/feed_processor_utils/html_parser.rb
71
+ - lib/feed_processor_utils/config/html_parser.yml
72
+ - lib/feed_processor_utils/config/feed_post_builder.yml
73
+ homepage: http://rubygems.org/gems/feed_processor_utils
74
+ licenses:
75
+ - MIT
76
+ post_install_message:
77
+ rdoc_options: []
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ required_rubygems_version: !ruby/object:Gem::Requirement
87
+ none: false
88
+ requirements:
89
+ - - ! '>='
90
+ - !ruby/object:Gem::Version
91
+ version: '0'
92
+ requirements: []
93
+ rubyforge_project:
94
+ rubygems_version: 1.8.24
95
+ signing_key:
96
+ specification_version: 3
97
+ summary: Feed Processing toolbox
98
+ test_files: []