fly_parser 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/fly_parser/enable_source.rb +41 -0
- data/lib/fly_parser/sources/news-fr.rb +39 -0
- data/lib/fly_parser.rb +5 -22
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 04dc5a5ee6aefcaad08a66b4d86d418d2e2d6621
|
4
|
+
data.tar.gz: 8145b7beb20e9b7047c2422e3172a14b25218267
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4283c77559952bdcc2ed404e9b629c0263b806d9d0a776fed4d198674b6e92fb5601f65942fb9b29297fc7e2a42599ceba6740f2fcedcbce2bda0ec1432f9e0b
|
7
|
+
data.tar.gz: 37b9f632c4a943defa36e6d5d6a666ed3d3735379d8e0221b10afb881e64d4df096b92f9fac75d215a7cfe7cf833b502713feba5c298212385fe6daf2a21eed4
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# add new parser source here
|
2
|
+
module Enable
|
3
|
+
def fitness(source)
|
4
|
+
lambda do |item|
|
5
|
+
item["parser"] = Parser::Exercise.new(item["url"], source: source) and next if item["type"] == "exercises"
|
6
|
+
item["parser"] = Parser::Fitness.new(item["url"], source: source)
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
def news(source)
|
11
|
+
lambda { |item| item["parser"] = Parser::News.new(item["url"], source: source) }
|
12
|
+
end
|
13
|
+
|
14
|
+
def local(source)
|
15
|
+
lambda { |item| item["parser"] = Parser::News.new(item["file"], {type: :file, source: source}) }
|
16
|
+
end
|
17
|
+
|
18
|
+
def news_nl(source)
|
19
|
+
lambda { |item| item["parser"] = Parser::NewsNl.new(item["url"], source: source) }
|
20
|
+
end
|
21
|
+
|
22
|
+
def news_fr(source)
|
23
|
+
lambda {|item| item["parser"] = Parser::NewsFr.new(item["url"], source: source)}
|
24
|
+
end
|
25
|
+
|
26
|
+
def method_missing(meth, *args)
|
27
|
+
prefix = "enable_"
|
28
|
+
meth = meth.to_s
|
29
|
+
if meth.start_with?(prefix)
|
30
|
+
meth_name = meth.split(prefix).last
|
31
|
+
proc = send(meth_name, *args)
|
32
|
+
iterate_sources(*args, proc)
|
33
|
+
else
|
34
|
+
raise "Unknown method #{meth} in Enable class, ssory !"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def iterate_sources(source, block)
|
39
|
+
source["items"].each(&block)
|
40
|
+
end
|
41
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Parser
|
2
|
+
class NewsFr < XmlBase
|
3
|
+
def initialize(source, options = {})
|
4
|
+
super
|
5
|
+
end
|
6
|
+
|
7
|
+
def parse_all
|
8
|
+
items = @source.search('//item')
|
9
|
+
# # last_date = Time.now - 2.years # for dev 2 years
|
10
|
+
# # select! or reject! is not exists for Nokogiri#NodeSet
|
11
|
+
# # items = items.select {|item| item.xpath('pubDate').first.content() > last_date }
|
12
|
+
items.map do |item|
|
13
|
+
title = item.xpath('title/text()').text()
|
14
|
+
date = item.xpath('pubdate').first.content()
|
15
|
+
|
16
|
+
link = item.xpath('link/following-sibling::text()[1]').first
|
17
|
+
page = Nokogiri::HTML(open(link))
|
18
|
+
|
19
|
+
next if page.search('figure.img img').first.nil?
|
20
|
+
|
21
|
+
poster_image = page.search('.article-long figure.img img').first.attributes['src'].value
|
22
|
+
full_desc = page.search('.article-long .bd')
|
23
|
+
full_desc.search('.modification').remove()
|
24
|
+
full_desc.search('script').remove()
|
25
|
+
full_desc.search('.ft').remove()
|
26
|
+
full_desc.search('a').remove_attr('href')
|
27
|
+
full_desc.search('.twitter-tweet').remove()
|
28
|
+
|
29
|
+
desc = full_desc.inner_html
|
30
|
+
desc.gsub! /h2|h1|h3/, 'h4'
|
31
|
+
# remove href attributes
|
32
|
+
#full_desc = full_desc.text().gsub(/<a href="([a-zA-Z:\/\.\d\-]*)">(.*)<\/a>/,'<a>\2</a>')
|
33
|
+
copyright = "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
|
34
|
+
content = desc + copyright
|
35
|
+
{title: title, content: content, poster_image: poster_image}
|
36
|
+
end.compact
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
data/lib/fly_parser.rb
CHANGED
@@ -34,6 +34,7 @@ end
|
|
34
34
|
|
35
35
|
module Parser
|
36
36
|
class << self
|
37
|
+
include Enable if defined? Enable
|
37
38
|
# Get HTTP Source
|
38
39
|
def http(url)
|
39
40
|
Nokogiri::HTML(open(url))
|
@@ -41,6 +42,7 @@ module Parser
|
|
41
42
|
|
42
43
|
def connect(url)
|
43
44
|
agent = Mechanize.new
|
45
|
+
agent.pluggable_parser.default = Mechanize::Page
|
44
46
|
agent.get(url)
|
45
47
|
end
|
46
48
|
|
@@ -93,29 +95,10 @@ module Parser
|
|
93
95
|
File.read(LOGO_PATH)
|
94
96
|
end
|
95
97
|
|
96
|
-
# choose parser for source here
|
97
98
|
def init_parser(source)
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
item["parser"] = Parser::Exercise.new(item["url"], source: source) and next if item["type"] == "exercises"
|
102
|
-
item["parser"] = Parser::Fitness.new(item["url"], source: source)
|
103
|
-
end
|
104
|
-
when "news"
|
105
|
-
source["items"].each do |item|
|
106
|
-
item["parser"] = Parser::News.new(item["url"], source: source)
|
107
|
-
end
|
108
|
-
when "local"
|
109
|
-
if source["enabled"]
|
110
|
-
source["items"].each do |item|
|
111
|
-
item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
|
112
|
-
end
|
113
|
-
end
|
114
|
-
when "news-nl"
|
115
|
-
source["items"].each do |item|
|
116
|
-
item["parser"] = Parser::NewsNl.new(item["url"], source: source)
|
117
|
-
end
|
118
|
-
end
|
99
|
+
source_type = source["source"].gsub('-', '_')
|
100
|
+
prefix = "enable_"
|
101
|
+
send(prefix + source_type, source)
|
119
102
|
end
|
120
103
|
|
121
104
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fly_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ruslan Korolev
|
@@ -118,11 +118,13 @@ files:
|
|
118
118
|
- lib/fly_parser.rb
|
119
119
|
- lib/fly_parser/base.rb
|
120
120
|
- lib/fly_parser/config_example.yml
|
121
|
+
- lib/fly_parser/enable_source.rb
|
121
122
|
- lib/fly_parser/logo.txt
|
122
123
|
- lib/fly_parser/mechanize_fix.rb
|
123
124
|
- lib/fly_parser/sources/astrology.rb
|
124
125
|
- lib/fly_parser/sources/exercise.rb
|
125
126
|
- lib/fly_parser/sources/fitness.rb
|
127
|
+
- lib/fly_parser/sources/news-fr.rb
|
126
128
|
- lib/fly_parser/sources/news-nl.rb
|
127
129
|
- lib/fly_parser/sources/news.rb
|
128
130
|
- lib/fly_parser/sources/sport.rb
|