fly_parser 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 95322be0f53123e2b8f9381e142e77a77556ce71
+   data.tar.gz: d04007e65f87581cfc2e4b364c03d48169132c1f
+ SHA512:
+   metadata.gz: 6c277897c67c70bf11a884fb1d3df66cea1d919ff8818c195fc0c7a25daac0ff31dc1b1453b6ba8fd0e31ccc2e2911cf8207d9dad1f9f6b460489a10a4f1bc1d
+   data.tar.gz: e0b89b476a24b764bd684b186dd750387831989316a86758b93f75b5ec2a301182dcc8c01e0c35e57efb0e30e6a9a72160e59ccdd501567685ac75868804f2ee
lib/fly_parser/base.rb ADDED
@@ -0,0 +1,46 @@
+ module Parser
+   # Base class for the HTML page parsers (template-method pattern).
+   class Base
+     def initialize(url, options)
+       @source = Parser.connect(url)
+       @copyright = copyright(options)
+       # ||= so that subclasses may preset these before calling super
+       @limit_pages ||= 5
+       @delay ||= 10
+     end
+
+     # Subclasses override this with a no-argument version that calls
+     # super with their own "next page" link selector.
+     def next_page(css_selector)
+       @next_page = @source.links_with(css_selector)[0]
+     end
+
+     # Parse the current page, then keep following "next page" links,
+     # concatenating all pages into one array.
+     def parse_all
+       result = parse_page
+       next_page
+       i = 1
+       ap "Parsing page #{i}"
+       until @next_page.nil? || i == @limit_pages
+         sleep @delay
+         i += 1
+         ap "Parsing page #{i}"
+         @source = @next_page.click
+         next_page
+         result.concat(parse_page)
+       end
+       result
+     end
+
+     # Hook: subclasses implement the actual scraping of a single page.
+     def parse_page
+     end
+
+     # Collect sibling nodes from first up to and including last.
+     def collect_between(first, last)
+       return nil if first.nil?
+       first == last ? [first] : [first, *collect_between(first.next, last)]
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+   end
+ end
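Note: the following is a minimal sketch of how a concrete parser plugs into this Base class, assuming the template-method reading above; the Blog class name and all selectors here are hypothetical, not part of the gem.

module Parser
  class Blog < Base
    # hypothetical no-arg override that feeds Base#next_page its selector
    def next_page
      super(:class => 'next-link')
    end

    # return an array of hashes, one per article on the current page
    def parse_page
      @source.search('.post').map do |post|
        { title: post.search('.title').text, content: post.search('.entry').inner_html }
      end
    end
  end
end

# Parser::Blog.new(url, source: source_config).parse_all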
lib/fly_parser/config_example.yml ADDED
File without changes
lib/fly_parser/logo.txt ADDED
@@ -0,0 +1,9 @@
+      .---.        .-----------
+     /     \  __  /    ------
+    / /     \(  )/    -----
+   //////   ' \/ `   ---
+  //// / // :    : ---
+ // /   /  /`    '--
+ //          //..\\
+ =============UU====UU====
+             '//||\\`  Fly Group Inc.
lib/fly_parser/mechanize_fix.rb ADDED
@@ -0,0 +1,53 @@
+ class Mechanize::HTTP::Agent
+   MAX_RESET_RETRIES = 10
+
+   # We need to replace the core Mechanize HTTP method:
+   #
+   #   Mechanize::HTTP::Agent#fetch
+   #
+   # with a wrapper that handles the infamous "too many connection resets"
+   # Mechanize bug described here:
+   #
+   #   https://github.com/sparklemotion/mechanize/issues/123
+   #
+   # The wrapper shuts down the persistent HTTP connection when it fails
+   # with this error, and simply tries again. In practice, this only ever
+   # needs to be retried once, but we let it retry a few times
+   # (MAX_RESET_RETRIES), just in case.
+   def fetch_with_retry(
+     uri,
+     method = :get,
+     headers = {},
+     params = [],
+     referer = current_page,
+     redirects = 0
+   )
+     action = "#{method.to_s.upcase} #{uri}"
+     retry_count = 0
+
+     begin
+       fetch_without_retry(uri, method, headers, params, referer, redirects)
+     rescue Net::HTTP::Persistent::Error => e
+       # Pass on any other type of error.
+       raise unless e.message =~ /too many connection resets/
+
+       # Pass on the error if we've already retried too many times.
+       if retry_count >= MAX_RESET_RETRIES
+         puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
+         raise
+       end
+
+       # Otherwise, shut down the persistent HTTP connection and try again.
+       puts "**** WARN: Mechanize retrying connection reset error: #{action}"
+       retry_count += 1
+       self.http.shutdown
+       retry
+     end
+   end
+
+   # Alias so #fetch actually uses our new #fetch_with_retry, wrapping the
+   # old one, which is aliased as #fetch_without_retry.
+   alias_method :fetch_without_retry, :fetch
+   alias_method :fetch, :fetch_with_retry
+ end
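Note: a minimal usage sketch of the patch above — once this file is required, every Mechanize request goes through the retry wrapper transparently and nothing changes at the call site. The URL below is a placeholder.

require 'mechanize'
require_relative 'mechanize_fix'

agent = Mechanize.new
# A "too many connection resets" failure here is now retried
# (up to MAX_RESET_RETRIES times) instead of raising immediately.
page = agent.get('http://example.com/articles')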
lib/fly_parser/sources/astrology.rb ADDED
@@ -0,0 +1,67 @@
+ require 'hashie'
+ module Parser
+   class Astrology
+     def initialize(source)
+       # zodiac sign names in Russian, Aries through Pisces
+       @zodiacs = ['Овен','Телец','Близнецы','Рак','Лев','Дева','Весы','Скорпион','Стрелец','Козерог','Водолей','Рыбы']
+       @source = Parser.connect(source)
+
+       # "Horoscope for today", "Horoscope for tomorrow", "Horoscope for the week"
+       small_titles = ["Гороскоп на сегодня", "Гороскоп на завтра", "Гороскоп на неделю"]
+       # "Horoscopes for the month", "Horoscope for 2014", "Horoscope for 2014, year of the green Horse", "Horoscope for September 2014"
+       big_titles = ["Гороскопы на месяц", "Гороскоп на 2014 год", "Гороскоп на 2014 год зеленой Лошади", "Гороскоп на сентябрь 2014"]
+       @titles = Hashie::Mash.new
+       @titles.small = small_titles
+       @titles.big = big_titles
+     end
+
+     # text is the link text to follow; date selects the page layout
+     # ('small' for daily/weekly pages, 'big' for monthly/yearly pages).
+     def parse_in(text = "Гороскоп на сегодня", date = 'small')
+       @text = text
+       @date = date
+       parse_content
+     end
+
+     def parse_content
+       zodiac_links.map do |item|
+         link = item.link
+         zodiac = item.zodiac
+         @page = Parser.http(link.value)
+         content = (@date == 'small' ? parse_small : parse_big)
+         result = Hashie::Mash.new
+         result.zodiac = zodiac
+         result.content = content
+         result
+       end
+     end
+
+     def parse_all
+       small_content = @titles.small.map { |title| {title: title, content: parse_in(title, "small")} }
+       big_content = @titles.big.map { |title| {title: title, content: parse_in(title, "big")} }
+
+       small_content.concat big_content
+     end
+
+     private
+
+     def current_page
+       @source.link_with(:text => @text).click
+     end
+
+     def zodiac_links
+       page = current_page # click the section link once, not once per sign
+       @zodiacs.map do |z|
+         result = Hashie::Mash.new
+         result.link = page.search("a:contains('#{z}')")[0].attributes["href"]
+         result.zodiac = z
+         result
+       end
+     end
+
+     def parse_small
+       @page.css('#main').children.reject do |el|
+         css = el.attributes['class'] && el.attributes['class'].value
+         ['lp50', 'rp50', 'space'].include?(css) ||
+           ['img', 'br', 'b', 'h1'].include?(el.name) ||
+           ["\n", "\n\n"].include?(el.text)
+       end.join
+     end
+
+     def parse_big
+       @page.css('#main .lp50').text
+     end
+   end
+ end
lib/fly_parser/sources/exercise.rb ADDED
@@ -0,0 +1,33 @@
+ module Parser
+   class Exercise < Base
+     def initialize(url, options)
+       @delay = 5 # set before super so Base's `@delay ||= 10` keeps it
+       super
+     end
+
+     def next_page
+       super(:class => 'nextpostslink')
+     end
+
+     # Parse every "read more" article on the current page.
+     def parse_page
+       links = @source.links_with(:class => 'readmore')
+       links.map do |link|
+         page = link.click
+         article = page.search('.post')
+         title = article.search('.title').text
+         wrapper = article.search('.entry')
+         wrapper.search('a').remove_attr('href')
+         poster_image = wrapper.search('img').first.attributes['src'].value
+         start_element = wrapper.at('p:nth-child(2)')
+         # the article body ends at this ad-insertion comment marker
+         end_element = wrapper.xpath("//comment()[. = ' adman_adcode_after ']").first
+         next if start_element.next.nil?
+         content = collect_between(start_element, end_element).map(&:to_s).join
+         next if content.empty?
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
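Note: a minimal, self-contained sketch of what Base#collect_between does with the markers above — it walks sibling nodes from a start element up to and including an end marker. The HTML here is made up for illustration, and collect_between is copied from Base so the snippet runs on its own.

require 'nokogiri'

doc = Nokogiri::HTML(<<-HTML)
  <div class="entry">
    <p>intro</p>
    <p>first body paragraph</p>
    <p>second body paragraph</p>
    <!-- adman_adcode_after -->
    <p>ad follows here</p>
  </div>
HTML

wrapper = doc.at('.entry')
start_element = wrapper.at('p:nth-child(2)')
end_element = wrapper.xpath(".//comment()[. = ' adman_adcode_after ']").first

def collect_between(first, last)
  return nil if first.nil?
  first == last ? [first] : [first, *collect_between(first.next, last)]
end

# collects the two body paragraphs (plus interleaved whitespace text
# nodes) up to and including the marker comment
puts collect_between(start_element, end_element).map(&:to_s).join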
lib/fly_parser/sources/fitness.rb ADDED
@@ -0,0 +1,36 @@
+ # TODO: parse all pages first, and download only the newest later
+ module Parser
+   class Fitness < Base
+     def next_page
+       super(:id => 'next_page')
+     end
+
+     def parse_page
+       links = @source.links_with(:class => 'article-headine__link')
+       links.map do |link|
+         page = link.click
+         article = page.search('div[itemscope]')
+         title = article.search('.article-name').text
+         wrapper = article.search('.article-text__wrapper')
+         # if there is no poster image, the item is likely a promo ad, so skip it
+         next if wrapper.search('.article_image__pic').first.nil?
+         poster_image = @copyright[:url] + wrapper.search('.article_image__pic').first.attributes['src'].value
+         wrapper.search('.article_image__pic').first.remove
+         # expand relative paths for the remaining images
+         images = wrapper.search('.article_image__pic')
+         images.each { |image| image.attributes['src'].value = @copyright[:url] + image.attributes['src'].value }
+         # remove ads and strip links
+         wrapper.search('.po_theme').remove
+         wrapper.search('a').remove_attr('href')
+
+         {title: title, content: wrapper.inner_html, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
lib/fly_parser/sources/news.rb ADDED
@@ -0,0 +1,53 @@
+ module Parser
+   class News
+     def initialize(source, options = {})
+       source = fake_url(source) if options[:type] == :file
+       @copyright = copyright(options)
+       @source = Parser.connect(source)
+       @delay ||= 10
+     end
+
+     # Serve a local file from a fake URL so Mechanize can "fetch" it;
+     # the URL itself is arbitrary.
+     def fake_url(source)
+       stream = File.read(source)
+       url = "http://www.google.com"
+       FakeWeb.register_uri(:get, url, :body => stream, :content_type => "application/xml")
+       url
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+
+     def parse_all
+       items = @source.search('//item')
+       last_date = Time.now - 2.years # 2 years back, for development
+       # Nokogiri::XML::NodeSet has no select!/reject!, so reassign;
+       # pubDate is a string, so parse it before comparing with Time
+       items = items.select { |item| Time.parse(item.xpath('pubDate').first.content) > last_date }
+       items.map do |item|
+         title = item.xpath('title/text()').text
+         link = item.xpath('link/text()').text
+         page = Nokogiri::HTML(open(link))
+         next if page.search('.article_illustration img').first.nil?
+         poster_image = page.search('.article_illustration img').first.attributes['src'].value
+         short_desc = page.search('.article_lead').first.content
+
+         full_desc = page.search('.article_full_text')
+         full_desc.search('.article_illustration').remove
+         full_desc.search('.inject-data').remove
+         full_desc.search('a').remove
+
+         # "Источник" is Russian for "Source"
+         copyright = "<p>Источник: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
+         content = "<p>#{short_desc}</p>" + full_desc.inner_html + copyright
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
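Note: a minimal sketch of the FakeWeb trick used in fake_url above — register a canned body for a URL, and Mechanize fetches it as if it came over HTTP. The URL and feed body here are made up.

require 'mechanize'
require 'fakeweb'

url = "http://fake-feed.example/rss"
body = "<rss><channel><item><title>hi</title></item></channel></rss>"
FakeWeb.register_uri(:get, url, :body => body, :content_type => "application/xml")

agent = Mechanize.new
page = agent.get(url) # served from the registered body, no network call
puts page.body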
lib/fly_parser/sources/sport.rb ADDED
@@ -0,0 +1,27 @@
+ module Parser
+   class Sport
+     def initialize(source)
+       @source = Parser.connect(source)
+       # category names in Russian: football, hockey, basketball, auto/moto, tennis
+       @categories = %w(Футбол Хоккей Баскетбол Авто/мото Теннис)
+     end
+
+     def parse_in(category)
+       @category = category
+       @current_page = current_page
+       @current_page.search('.anons').map do |news_item|
+         href = news_item.css('a').first.attributes['href'].value
+         item_page = @current_page.link_with(:href => href).click
+         { title: item_page.search('.titleH1').text, content: item_page.search('.article-textBlock').text }
+       end
+     end
+
+     def current_page
+       @source.link_with(:text => @category).click
+     end
+
+     def parse_all
+       @categories.map { |category| parse_in(category) }
+     end
+   end
+ end
lib/fly_parser/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Parser
+   VERSION = '0.0.1'
+ end
data/lib/fly_parser.rb ADDED
@@ -0,0 +1,122 @@
+ # A little library for parsing articles.
+ # Setup: chmod +x lib/run_parser.rb
+ # then run from the Rails root: bundle exec lib/run_parser.rb
+ require 'pry'
+ require 'open-uri'
+ require 'yaml'
+
+ BASE_PATH = File.expand_path("fly_parser/base", File.dirname(__FILE__))
+ LOGO_PATH = File.expand_path("fly_parser/logo.txt", File.dirname(__FILE__))
+ MECHANIZE_FIX = File.expand_path("fly_parser/mechanize_fix", File.dirname(__FILE__))
+
+ require BASE_PATH
+ # RAILS_ROOT, RAILS_BOOT_PATH, RAILS_CONFIG_PATH and CONFIG_PATH are
+ # expected to be defined by the launcher script (run_parser.rb).
+ Dir.chdir RAILS_ROOT
+ require RAILS_BOOT_PATH
+ require RAILS_CONFIG_PATH
+ Pry.config.print = proc { |output, value| output.puts value.ai }
+
+ # Require all of the Ruby files in the given directory.
+ #
+ # path - The String relative path from here to the directory.
+ #
+ # Returns nothing.
+ def require_all(path)
+   glob = File.expand_path(File.join(File.dirname(__FILE__), path, '**', '*.rb'))
+   Dir[glob].each do |f|
+     require f
+   end
+ end
+ require_all 'fly_parser/sources'
+
+ # fix Mechanize by monkey-patching :)
+ require MECHANIZE_FIX
+
+ module Parser
+   class << self
+     # Fetch a URL and return it as a parsed Nokogiri document.
+     def http(url)
+       Nokogiri::HTML(open(url))
+     end
+
+     # Open a URL with Mechanize and return the page.
+     def connect(url)
+       agent = Mechanize.new
+       agent.get(url)
+     end
+
+     def save(articles, options)
+       articles.each do |article|
+         item = Article.new(title: article[:title], content: article[:content])
+         item.categories = [Category.find(options[:category_id])]
+         item.remote_image_url = article[:poster_image]
+         item.save!
+       end
+     end
+
+     def start
+       puts logo
+
+       source = find_source
+
+       init_parser(source)
+       parse_and_save(source["items"])
+     end
+
+     def parse_and_save(items)
+       items.each do |item|
+         ap "Parsing #{item['type']}"
+         result = parse_item(item)
+         save_item(item, result)
+       end
+     end
+
+     def save_item(item, result)
+       category = Category.find_or_create_by!(name: JSON.generate(en: item["category"]))
+       ap "and saving to the #{category.localized_name} category"
+       Parser.save result, {category_id: category.id}
+     end
+
+     def parse_item(item)
+       item["parser"].parse_all
+     end
+
+     def config
+       YAML.load_file(CONFIG_PATH)
+     end
+
+     def find_source
+       config["sources"].find { |source| source["enabled"] }
+     end
+
+     def logo
+       File.read(LOGO_PATH)
+     end
+
+     # choose the parser for each source item here
+     def init_parser(source)
+       case source["source"]
+       when "fitness"
+         source["items"].each do |item|
+           if item["type"] == "exercises"
+             item["parser"] = Parser::Exercise.new(item["url"], source: source)
+           else
+             item["parser"] = Parser::Fitness.new(item["url"], source: source)
+           end
+         end
+       when "news"
+         source["items"].each do |item|
+           item["parser"] = Parser::News.new(item["url"], source: source)
+         end
+       when "local"
+         if source["enabled"]
+           source["items"].each do |item|
+             item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
+           end
+         end
+       end
+     end
+   end
+ end
+
+ # astrology usage example:
+ # astro = Parser::Astrology.new('http://moj-znak-zodiaka.ru/')
+ # astro.parse_in("Гороскоп на сентябрь 2014", "big")   # "Horoscope for September 2014"
+ # astro.parse_all
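Note: the gem ships a config_example.yml whose contents are not shown in this diff. Below is a hedged sketch of the structure that Parser.config, find_source, init_parser and the copyright helpers appear to expect; every key is inferred from the code above, and all URLs and values are placeholders, not the real example file.

# config.yml (hypothetical)
sources:
  - source: "fitness"
    enabled: true
    copyright: "http://fitness-site.example"
    copyright_title: "Fitness Site"
    items:
      - type: "exercises"
        category: "Exercises"
        url: "http://fitness-site.example/exercises"
      - type: "articles"
        category: "Fitness"
        url: "http://fitness-site.example/articles"
  - source: "news"
    enabled: false
    copyright: "http://news-site.example"
    copyright_title: "News Site"
    items:
      - type: "news"
        category: "News"
        url: "http://news-site.example/rss"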
metadata ADDED
@@ -0,0 +1,154 @@
+ --- !ruby/object:Gem::Specification
+ name: fly_parser
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Ruslan Korolev
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-10-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rails
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: hashie
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: fakeweb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: awesome_print
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: pry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Simple fly parser for internal uses
+ email:
+ - rusik3@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/fly_parser.rb
+ - lib/fly_parser/base.rb
+ - lib/fly_parser/config_example.yml
+ - lib/fly_parser/logo.txt
+ - lib/fly_parser/mechanize_fix.rb
+ - lib/fly_parser/sources/astrology.rb
+ - lib/fly_parser/sources/exercise.rb
+ - lib/fly_parser/sources/fitness.rb
+ - lib/fly_parser/sources/news.rb
+ - lib/fly_parser/sources/sport.rb
+ - lib/fly_parser/version.rb
+ homepage: http://rubygems.org
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: 2.0.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.1
+ signing_key:
+ specification_version: 4
+ summary: Fly parser
+ test_files: []
+ has_rdoc: