fly_parser 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/fly_parser/sources/news-nl.rb +30 -0
- data/lib/fly_parser/sources/news.rb +2 -23
- data/lib/fly_parser/xml_base.rb +28 -0
- data/lib/fly_parser.rb +8 -7
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a4b60890098babea5cdf0a19b2ab7f72207bce93
|
4
|
+
data.tar.gz: a5cf7fdba982c30f622f2c048ea5ef08d6a4565a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d9bfcfbc33a83acbf053cca55206f27377a1f6ab62ead285def08fc43ef659ea61aee533c5c743b21ee06b8d7c22e79b77942370ce76ace69270e14f359f5fd
|
7
|
+
data.tar.gz: f4b199bf649c121cbf1883e00acf451742dd25830076bc3b55a62258e149d1c314026e447580a3c8e2eab3d42b6648e5b9adeb763056aa3bec10322c62d6c288
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Parser
|
2
|
+
class NewsNl < XmlBase
|
3
|
+
def initialize(source, options = {})
|
4
|
+
super
|
5
|
+
end
|
6
|
+
|
7
|
+
def parse_all
|
8
|
+
items = @source.search('//item')
|
9
|
+
# last_date = Time.now - 2.years # for dev 2 years
|
10
|
+
# select! and reject! do not exist for Nokogiri::XML::NodeSet
|
11
|
+
# items = items.select {|item| item.xpath('pubDate').first.content() > last_date }
|
12
|
+
items.map do |item|
|
13
|
+
title = item.xpath('title/text()').text()
|
14
|
+
date = item.xpath('pubDate').first.content()
|
15
|
+
link = item.xpath('link/text()').text()
|
16
|
+
page = Nokogiri::HTML(open(link))
|
17
|
+
|
18
|
+
next if page.search('#article-image a img').first.nil?
|
19
|
+
poster_image = page.search('#article-image a img').first.attributes['src'].value
|
20
|
+
full_desc = item.xpath('description/text()')
|
21
|
+
# remove href attributes
|
22
|
+
full_desc = full_desc.text().gsub(/<a href="([a-zA-Z:\/\.\d\-]*)">(.*)<\/a>/,'<a>\2</a>')
|
23
|
+
|
24
|
+
copyright = "<p>Source: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
|
25
|
+
content = full_desc + copyright
|
26
|
+
{title: title, content: content, poster_image: poster_image}
|
27
|
+
end.compact
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -1,28 +1,7 @@
|
|
1
1
|
module Parser
|
2
|
-
class News
|
2
|
+
class News < XmlBase
|
3
3
|
def initialize(source, options = {})
|
4
|
-
|
5
|
-
source = fake_url(source)
|
6
|
-
end
|
7
|
-
@copyright = copyright(options)
|
8
|
-
@source = Parser.connect(source)
|
9
|
-
@delay ||= 10
|
10
|
-
end
|
11
|
-
|
12
|
-
def fake_url(source)
|
13
|
-
stream = File.read(source)
|
14
|
-
# test_file.com is a random url, just for Mechanize parsing
|
15
|
-
url = "http://www.google.com"
|
16
|
-
FakeWeb.register_uri(:get, url, :body => stream, :content_type => "application/xml")
|
17
|
-
url
|
18
|
-
end
|
19
|
-
|
20
|
-
def copyright(options)
|
21
|
-
source = options[:source]
|
22
|
-
{
|
23
|
-
url: source['copyright'],
|
24
|
-
title: source['copyright_title']
|
25
|
-
}
|
4
|
+
super
|
26
5
|
end
|
27
6
|
|
28
7
|
def parse_all
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Parser
|
2
|
+
class XmlBase
|
3
|
+
def initialize(source, options = {})
|
4
|
+
if options[:type] == :file
|
5
|
+
source = fake_url(source)
|
6
|
+
end
|
7
|
+
@copyright = copyright(options)
|
8
|
+
@source = Parser.connect(source)
|
9
|
+
@delay ||= 10
|
10
|
+
end
|
11
|
+
|
12
|
+
def fake_url(source)
|
13
|
+
stream = File.read(source)
|
14
|
+
# test_file.com is a random url, just for Mechanize parsing
|
15
|
+
url = "http://www.google.com"
|
16
|
+
FakeWeb.register_uri(:get, url, :body => stream, :content_type => "application/xml")
|
17
|
+
url
|
18
|
+
end
|
19
|
+
|
20
|
+
def copyright(options)
|
21
|
+
source = options[:source]
|
22
|
+
{
|
23
|
+
url: source['copyright'],
|
24
|
+
title: source['copyright_title']
|
25
|
+
}
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
data/lib/fly_parser.rb
CHANGED
@@ -5,9 +5,8 @@ require 'pry'
|
|
5
5
|
require 'open-uri'
|
6
6
|
require 'yaml'
|
7
7
|
require 'mechanize'
|
8
|
-
BASE_PATH = File.expand_path("fly_parser
|
8
|
+
BASE_PATH = File.expand_path("fly_parser/*.rb", File.dirname(__FILE__))
|
9
9
|
LOGO_PATH = File.expand_path("fly_parser/logo.txt", File.dirname(__FILE__))
|
10
|
-
MECHANIZE_FIX = File.expand_path("fly_parser/mechanize_fix", File.dirname(__FILE__))
|
11
10
|
|
12
11
|
Pry.config.print = proc { |output, value| output.puts value.ai }
|
13
12
|
|
@@ -24,17 +23,15 @@ def require_all(path)
|
|
24
23
|
end
|
25
24
|
|
26
25
|
unless defined? Rails
|
27
|
-
|
26
|
+
Dir[BASE_PATH].each do |base_file|
|
27
|
+
require base_file
|
28
|
+
end
|
28
29
|
Dir.chdir RAILS_ROOT
|
29
30
|
require RAILS_BOOT_PATH
|
30
31
|
require RAILS_CONFIG_PATH
|
31
32
|
require_all 'fly_parser/sources'
|
32
33
|
end
|
33
34
|
|
34
|
-
# fix mechanize by monkey-patching :)
|
35
|
-
require MECHANIZE_FIX
|
36
|
-
|
37
|
-
|
38
35
|
module Parser
|
39
36
|
class << self
|
40
37
|
# Get HTTP Source
|
@@ -114,6 +111,10 @@ module Parser
|
|
114
111
|
item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
|
115
112
|
end
|
116
113
|
end
|
114
|
+
when "news-nl"
|
115
|
+
source["items"].each do |item|
|
116
|
+
item["parser"] = Parser::NewsNl.new(item["url"], source: source)
|
117
|
+
end
|
117
118
|
end
|
118
119
|
end
|
119
120
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fly_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ruslan Korolev
|
@@ -123,8 +123,10 @@ files:
|
|
123
123
|
- lib/fly_parser/sources/astrology.rb
|
124
124
|
- lib/fly_parser/sources/exercise.rb
|
125
125
|
- lib/fly_parser/sources/fitness.rb
|
126
|
+
- lib/fly_parser/sources/news-nl.rb
|
126
127
|
- lib/fly_parser/sources/news.rb
|
127
128
|
- lib/fly_parser/sources/sport.rb
|
129
|
+
- lib/fly_parser/xml_base.rb
|
128
130
|
homepage: http://rubygems.org
|
129
131
|
licenses:
|
130
132
|
- MIT
|