fly_parser 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/fly_parser/enable_source.rb +41 -0
- data/lib/fly_parser/sources/news-fr.rb +39 -0
- data/lib/fly_parser.rb +5 -22
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04dc5a5ee6aefcaad08a66b4d86d418d2e2d6621
|
4
|
+
data.tar.gz: 8145b7beb20e9b7047c2422e3172a14b25218267
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4283c77559952bdcc2ed404e9b629c0263b806d9d0a776fed4d198674b6e92fb5601f65942fb9b29297fc7e2a42599ceba6740f2fcedcbce2bda0ec1432f9e0b
|
7
|
+
data.tar.gz: 37b9f632c4a943defa36e6d5d6a666ed3d3735379d8e0221b10afb881e64d4df096b92f9fac75d215a7cfe7cf833b502713feba5c298212385fe6daf2a21eed4
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# add new parser source here
|
2
|
+
# Mixin that assigns a parser object to every item of a configured source.
#
# Each public builder method (+fitness+, +news+, +local+, +news_nl+,
# +news_fr+) takes the source hash and returns a lambda that, given one
# item hash, stores a new parser under item["parser"].
#
# Calling +enable_<builder>(source)+ is handled by +method_missing+: it looks
# up the builder named after the prefix and applies the resulting lambda to
# every entry of source["items"].
module Enable
  # Builder for fitness sources: items whose "type" is "exercises" get a
  # Parser::Exercise, every other item gets a Parser::Fitness.
  def fitness(source)
    lambda do |item|
      parser_class = item["type"] == "exercises" ? Parser::Exercise : Parser::Fitness
      item["parser"] = parser_class.new(item["url"], source: source)
    end
  end

  # Builder for news sources: Parser::News built from item["url"].
  def news(source)
    lambda { |item| item["parser"] = Parser::News.new(item["url"], source: source) }
  end

  # Builder for local sources: Parser::News built from item["file"] with
  # the :file type option.
  def local(source)
    lambda { |item| item["parser"] = Parser::News.new(item["file"], {type: :file, source: source}) }
  end

  # Builder for "news-nl" sources: Parser::NewsNl built from item["url"].
  def news_nl(source)
    lambda { |item| item["parser"] = Parser::NewsNl.new(item["url"], source: source) }
  end

  # Builder for "news-fr" sources: Parser::NewsFr built from item["url"].
  def news_fr(source)
    lambda { |item| item["parser"] = Parser::NewsFr.new(item["url"], source: source) }
  end

  # Dispatches +enable_<builder>(source)+ calls.
  #
  # Only builders defined by this module are reachable; any other name
  # (including a bare "enable_" or a name resolving to a Kernel method such
  # as "system") raises instead of being forwarded to +send+.
  #
  # Raises RuntimeError for every unknown method name.
  def method_missing(meth, *args)
    builder = builder_name_for(meth)
    raise "Unknown method #{meth} in Enable class, sorry !" unless builder

    handler = send(builder, *args)
    iterate_sources(*args, handler)
  end

  # Keeps respond_to? consistent with what method_missing actually handles.
  def respond_to_missing?(meth, include_private = false)
    !builder_name_for(meth).nil? || super
  end

  # Applies +block+ (a lambda/proc) to every item of source["items"].
  # A missing "items" entry is treated as an empty list.
  def iterate_sources(source, block)
    (source["items"] || []).each(&block)
  end

  private

  # Returns the builder method name encoded in an "enable_*" call, or nil
  # when +meth+ does not name a public builder defined by Enable itself.
  def builder_name_for(meth)
    prefix = "enable_"
    name = meth.to_s
    return nil unless name.start_with?(prefix)

    # Strip the leading prefix only; split(prefix).last would also cut on a
    # second occurrence of the prefix and returns nil for a bare "enable_".
    builder = name[prefix.length..-1]
    return nil if builder.empty?
    # The dispatch helpers themselves are not builders.
    return nil if %w[method_missing iterate_sources].include?(builder)

    Enable.public_method_defined?(builder) ? builder : nil
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'open-uri'

module Parser
  # Parser for the French news feed: walks the feed's <item> entries, fetches
  # each linked article page and extracts title, cleaned body HTML and poster
  # image.
  class NewsFr < XmlBase
    def initialize(source, options = {})
      super
    end

    # Returns an Array of hashes {title:, content:, poster_image:}, one per
    # feed item whose article page could be resolved and has a poster image.
    # Items without a link or without a poster image are skipped.
    def parse_all
      @source.search('//item').map { |item| parse_item(item) }.compact
    end

    private

    # Builds the result hash for one feed item, or returns nil when the item
    # has no usable link or the article has no poster image.
    def parse_item(item)
      title = item.xpath('title/text()').text

      # The URL is read from the text node that follows <link>.
      # NOTE(review): presumably the feed is parsed as HTML, where <link> is a
      # void element and its URL ends up as a sibling text node - confirm
      # against XmlBase.
      link = item.xpath('link/following-sibling::text()[1]').first
      return nil if link.nil?

      url = link.content.strip
      return nil if url.empty?

      # URI#open (from open-uri) only fetches URLs; Kernel#open would treat a
      # feed-supplied string starting with "|" as a shell command.
      page = Nokogiri::HTML(URI.parse(url).open)

      # Look up the exact node that is dereferenced below, so an image placed
      # outside .article-long cannot pass the guard and then fail on nil.
      image = page.search('.article-long figure.img img').first
      return nil if image.nil? || image['src'].nil?

      poster_image = image['src']
      content = article_html(page) + copyright_html
      {title: title, content: content, poster_image: poster_image}
    end

    # Returns the article body HTML with scripts, footers, modification notes,
    # embedded tweets and link targets removed, and h1-h3 demoted to h4.
    def article_html(page)
      full_desc = page.search('.article-long .bd')
      full_desc.search('.modification').remove
      full_desc.search('script').remove
      full_desc.search('.ft').remove
      full_desc.search('a').remove_attr('href')
      full_desc.search('.twitter-tweet').remove

      # Rename heading elements in the DOM; a textual gsub on "h1|h2|h3" would
      # also rewrite those substrings inside body text and attribute values.
      full_desc.search('h1, h2, h3').each { |heading| heading.name = 'h4' }

      full_desc.inner_html
    end

    # Attribution paragraph built from @copyright (read here with :url and
    # :title keys; it is not set in this class - presumably by the base class).
    def copyright_html
      "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
    end
  end
end
|
data/lib/fly_parser.rb
CHANGED
@@ -34,6 +34,7 @@ end
|
|
34
34
|
|
35
35
|
module Parser
|
36
36
|
class << self
|
37
|
+
include Enable if defined? Enable
|
37
38
|
# Get HTTP Source
|
38
39
|
def http(url)
|
39
40
|
Nokogiri::HTML(open(url))
|
@@ -41,6 +42,7 @@ module Parser
|
|
41
42
|
|
42
43
|
def connect(url)
|
43
44
|
agent = Mechanize.new
|
45
|
+
agent.pluggable_parser.default = Mechanize::Page
|
44
46
|
agent.get(url)
|
45
47
|
end
|
46
48
|
|
@@ -93,29 +95,10 @@ module Parser
|
|
93
95
|
File.read(LOGO_PATH)
|
94
96
|
end
|
95
97
|
|
96
|
-
# choose parser for source here
|
97
98
|
def init_parser(source)
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
item["parser"] = Parser::Exercise.new(item["url"], source: source) and next if item["type"] == "exercises"
|
102
|
-
item["parser"] = Parser::Fitness.new(item["url"], source: source)
|
103
|
-
end
|
104
|
-
when "news"
|
105
|
-
source["items"].each do |item|
|
106
|
-
item["parser"] = Parser::News.new(item["url"], source: source)
|
107
|
-
end
|
108
|
-
when "local"
|
109
|
-
if source["enabled"]
|
110
|
-
source["items"].each do |item|
|
111
|
-
item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
|
112
|
-
end
|
113
|
-
end
|
114
|
-
when "news-nl"
|
115
|
-
source["items"].each do |item|
|
116
|
-
item["parser"] = Parser::NewsNl.new(item["url"], source: source)
|
117
|
-
end
|
118
|
-
end
|
99
|
+
source_type = source["source"].gsub('-', '_')
|
100
|
+
prefix = "enable_"
|
101
|
+
send(prefix + source_type, source)
|
119
102
|
end
|
120
103
|
|
121
104
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fly_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ruslan Korolev
|
@@ -118,11 +118,13 @@ files:
|
|
118
118
|
- lib/fly_parser.rb
|
119
119
|
- lib/fly_parser/base.rb
|
120
120
|
- lib/fly_parser/config_example.yml
|
121
|
+
- lib/fly_parser/enable_source.rb
|
121
122
|
- lib/fly_parser/logo.txt
|
122
123
|
- lib/fly_parser/mechanize_fix.rb
|
123
124
|
- lib/fly_parser/sources/astrology.rb
|
124
125
|
- lib/fly_parser/sources/exercise.rb
|
125
126
|
- lib/fly_parser/sources/fitness.rb
|
127
|
+
- lib/fly_parser/sources/news-fr.rb
|
126
128
|
- lib/fly_parser/sources/news-nl.rb
|
127
129
|
- lib/fly_parser/sources/news.rb
|
128
130
|
- lib/fly_parser/sources/sport.rb
|