fly_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 95322be0f53123e2b8f9381e142e77a77556ce71
+   data.tar.gz: d04007e65f87581cfc2e4b364c03d48169132c1f
+ SHA512:
+   metadata.gz: 6c277897c67c70bf11a884fb1d3df66cea1d919ff8818c195fc0c7a25daac0ff31dc1b1453b6ba8fd0e31ccc2e2911cf8207d9dad1f9f6b460489a10a4f1bc1d
+   data.tar.gz: e0b89b476a24b764bd684b186dd750387831989316a86758b93f75b5ec2a301182dcc8c01e0c35e57efb0e30e6a9a72160e59ccdd501567685ac75868804f2ee
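These digests cover the gem's two inner archives. A quick way to recompute them from Ruby, assuming fly_parser-0.0.1.gem has been downloaded to the current directory (a .gem file is a plain tar archive containing metadata.gz and data.tar.gz):

    require 'digest'
    require 'rubygems/package'

    tar = Gem::Package::TarReader.new(File.open('fly_parser-0.0.1.gem', 'rb'))
    tar.each do |entry|
      next unless %w(metadata.gz data.tar.gz).include?(entry.full_name)
      # compare against the SHA512 section of checksums.yaml above
      puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
    end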
data/lib/fly_parser/base.rb ADDED
@@ -0,0 +1,51 @@
+ module Parser
+   # HTML base class for the source parsers
+   class Base
+
+     def initialize(url, options)
+       @source = Parser.connect(url)
+       @copyright = copyright(options)
+       @limit_pages ||= 5
+       @delay ||= 10
+     end
+
+     def next_page(css_selector)
+       @next_page = @source.links_with(css_selector)[0]
+     end
+
+     def parse_all
+       result = parse_page
+       next_page
+       # concatenate all pages into one array
+       i = 1
+       ap "Parsing page #{i}"
+       until @next_page.nil? || i == @limit_pages
+         sleep @delay
+         i += 1
+         ap "Parsing page #{i}"
+         @source = @next_page.click
+         next_page
+         result.concat(parse_page)
+       end
+       result
+     end
+
+     # overridden by each source parser
+     def parse_page
+     end
+
+     # collect sibling nodes from first up to and including last
+     def collect_between(first, last)
+       return nil if first.nil?
+       first == last ? [first] : [first, *collect_between(first.next, last)]
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+   end
+ end
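Parser::Base#collect_between walks #next siblings from one node to another, inclusive. A minimal illustration on a hypothetical HTML fragment, written as if called from inside a parser subclass (it is an instance method of Parser::Base):

    doc   = Nokogiri::HTML("<div><p>a</p><p>b</p><span>c</span></div>")
    first = doc.at('p')     # <p>a</p>
    last  = doc.at('span')  # <span>c</span>
    collect_between(first, last).map(&:to_s)
    # => ["<p>a</p>", "<p>b</p>", "<span>c</span>"]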
data/lib/fly_parser/config_example.yml ADDED
File without changes
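The example config itself is not rendered in the diff. A minimal sketch of the shape that Parser.start and Parser.init_parser in lib/fly_parser.rb expect — the keys (sources, enabled, source, items, type, url, file, category, copyright, copyright_title) are inferred from the code; every value here is illustrative:

    sources:
      - source: "fitness"
        enabled: true
        copyright: "http://example.com"        # base URL, also prepended to image paths
        copyright_title: "Example Source"
        items:
          - type: "exercises"
            url: "http://example.com/exercises"
            category: "Fitness"
          - type: "articles"
            url: "http://example.com/articles"
            category: "Health"

Parser.find_source picks the first source with enabled: true, so only one source is parsed per run.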
data/lib/fly_parser/logo.txt ADDED
@@ -0,0 +1,9 @@
+         .---.  .-----------
+        /     \  __  /    ------
+       / /     \(  )/    -----
+      //////   ' \/ `   ---
+     //// / // :    : ---
+    // /   /  /`    '--
+   //          //..\\
+          =============UU====UU====
+                  '//||\\`  Fly Group Inc.
data/lib/fly_parser/mechanize_fix.rb ADDED
@@ -0,0 +1,53 @@
+ class Mechanize::HTTP::Agent
+   MAX_RESET_RETRIES = 10
+
+   # We need to replace the core Mechanize HTTP method:
+   #
+   #   Mechanize::HTTP::Agent#fetch
+   #
+   # with a wrapper that handles the infamous "too many connection resets"
+   # Mechanize bug that is described here:
+   #
+   #   https://github.com/sparklemotion/mechanize/issues/123
+   #
+   # The wrapper shuts down the persistent HTTP connection when it fails with
+   # this error, and simply tries again. In practice, this only ever needs to
+   # be retried once, but I am going to let it retry a few times
+   # (MAX_RESET_RETRIES), just in case.
+   #
+   def fetch_with_retry(
+     uri,
+     method = :get,
+     headers = {},
+     params = [],
+     referer = current_page,
+     redirects = 0
+   )
+     action = "#{method.to_s.upcase} #{uri}"
+     retry_count = 0
+
+     begin
+       fetch_without_retry(uri, method, headers, params, referer, redirects)
+     rescue Net::HTTP::Persistent::Error => e
+       # Pass on any other type of error.
+       raise unless e.message =~ /too many connection resets/
+
+       # Pass on the error if we've tried too many times.
+       if retry_count >= MAX_RESET_RETRIES
+         puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
+         raise
+       end
+
+       # Otherwise, shut down the persistent HTTP connection and try again.
+       puts "**** WARN: Mechanize retrying connection reset error: #{action}"
+       retry_count += 1
+       self.http.shutdown
+       retry
+     end
+   end
+
+   # Alias so #fetch actually uses our new #fetch_with_retry to wrap the
+   # old one, aliased as #fetch_without_retry.
+   alias_method :fetch_without_retry, :fetch
+   alias_method :fetch, :fetch_with_retry
+ end
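Once this file is loaded (lib/fly_parser.rb requires it via MECHANIZE_FIX, after Rails and therefore Mechanize have booted, since reopening Mechanize::HTTP::Agent needs the class defined), every request made through any Mechanize agent goes through the retry wrapper transparently. A minimal sketch, assuming the URL is reachable:

    agent = Mechanize.new
    page  = agent.get("http://example.com")  # internally calls the patched #fetch

The alias-around idiom keeps the original method available as #fetch_without_retry, so the wrapper delegates to the untouched implementation.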
data/lib/fly_parser/sources/astrology.rb ADDED
@@ -0,0 +1,72 @@
+ require 'hashie'
+ module Parser
+   class Astrology
+
+     def initialize(source)
+       # the twelve zodiac sign names, in Russian, as they appear on the site
+       @zodiacs = ['Овен','Телец','Близнецы','Рак','Лев','Дева','Весы','Скорпион','Стрелец','Козерог','Водолей','Рыбы']
+       @source = Parser.connect(source)
+
+       small_titles = ["Гороскоп на сегодня", "Гороскоп на завтра", "Гороскоп на неделю"]
+       big_titles = ["Гороскопы на месяц", "Гороскоп на 2014 год", "Гороскоп на 2014 год зеленой Лошади", "Гороскоп на сентябрь 2014"]
+       @titles = Hashie::Mash.new
+       @titles.small = small_titles
+       @titles.big = big_titles
+     end
+
+     def parse_in(text = "Гороскоп на сегодня", date = 'small')
+       @text = text
+       @date = date
+       parse_content
+     end
+
+     def parse_content
+       zodiac_links.map do |item|
+         link = item.link
+         zodiac = item.zodiac
+         @page = Parser.http(link.value)
+         content = (@date == 'small' ? parse_small : parse_big)
+         result = Hashie::Mash.new
+         result.zodiac = zodiac
+         result.content = content
+         result
+       end
+     end
+
+     def parse_all
+       small_content = @titles.small.map { |title| {title: title, content: parse_in(title, "small")} }
+       big_content = @titles.big.map { |title| {title: title, content: parse_in(title, "big")} }
+
+       small_content.concat big_content
+     end
+
+     private
+
+     def current_page
+       @source.link_with(:text => @text).click
+     end
+
+     def zodiac_links
+       page = current_page # click through to the section once, not once per sign
+       @zodiacs.map do |z|
+         result = Hashie::Mash.new
+         result.link = page.search("a:contains('#{z}')")[0].attributes["href"]
+         result.zodiac = z
+         result
+       end
+     end
+
+     def parse_small
+       @page.css('#main').children.reject do |el|
+         klass = el.attributes['class'] && el.attributes['class'].value
+         %w(lp50 rp50 space).include?(klass) ||
+           %w(img br b h1).include?(el.name) ||
+           ["\n", "\n\n"].include?(el.text)
+       end.join
+     end
+
+     def parse_big
+       @page.css('#main .lp50').text
+     end
+   end
+ end
data/lib/fly_parser/sources/exercise.rb ADDED
@@ -0,0 +1,33 @@
+ module Parser
+   class Exercise < Base
+
+     def initialize(url, options)
+       @delay = 5
+       super
+     end
+
+     def next_page
+       super(:class => 'nextpostslink')
+     end
+
+     def parse_page
+       # parses one listing page; Base#parse_all drives the pagination
+       links = @source.links_with(:class => 'readmore')
+       links.map do |link|
+         page = link.click
+         article = page.search('.post')
+         title = article.search('.title').text
+         wrapper = article.search('.entry')
+         wrapper.search('a').remove_attr('href')
+         poster_image = wrapper.search('img').first.attributes['src'].value
+         start_element = wrapper.at('p:nth-child(2)')
+         end_element = wrapper.xpath("//comment()[. = ' adman_adcode_after ']").first
+         next if start_element.nil? || start_element.next.nil?
+         content = collect_between(start_element, end_element).map(&:to_s).join
+         next if content.empty?
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+
+   end
+ end
data/lib/fly_parser/sources/fitness.rb ADDED
@@ -0,0 +1,32 @@
+ # TODO: parse all pages first, and only download the newest ones later
+ module Parser
+   class Fitness < Base
+
+     def next_page
+       super(:id => 'next_page')
+     end
+
+     def parse_page
+       links = @source.links_with(:class => 'article-headine__link')
+       links.map do |link|
+         page = link.click
+         article = page.search('div[itemscope]')
+         title = article.search('.article-name').text
+         wrapper = article.search('.article-text__wrapper')
+         # if there is no poster image, the entry is most likely a promo ad, so skip it
+         next if wrapper.search('.article_image__pic').first.nil?
+         poster_image = @copyright[:url] + wrapper.search('.article_image__pic').first.attributes['src'].value
+         wrapper.search('.article_image__pic').first.remove
+         # expand the paths of the remaining images
+         images = wrapper.search('.article_image__pic')
+         images.each { |image| image.attributes['src'].value = @copyright[:url] + image.attributes['src'].value }
+         # remove ads and strip links
+         wrapper.search('.po_theme').remove
+         wrapper.search('a').remove_attr('href')
+
+         {title: title, content: wrapper.inner_html, poster_image: poster_image}
+       end.compact
+     end
+
+   end
+ end
data/lib/fly_parser/sources/news.rb ADDED
@@ -0,0 +1,54 @@
+ module Parser
+   class News
+     def initialize(source, options = {})
+       if options[:type] == :file
+         source = fake_url(source)
+       end
+       @copyright = copyright(options)
+       @source = Parser.connect(source)
+       @delay ||= 10
+     end
+
+     def fake_url(source)
+       stream = File.read(source)
+       # register a fake URL so Mechanize can "fetch" the local file; the request never hits the network
+       url = "http://www.google.com"
+       FakeWeb.register_uri(:get, url, :body => stream, :content_type => "application/xml")
+       url
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+
+     def parse_all
+       items = @source.search('//item')
+       last_date = Time.now - 2.years # dev setting: look two years back
+       # select!/reject! do not exist on Nokogiri::XML::NodeSet; pubDate is a string, so parse it before comparing
+       items = items.select { |item| Time.parse(item.xpath('pubDate').first.content) > last_date }
+       items.map do |item|
+         title = item.xpath('title/text()').text
+         date = item.xpath('pubDate').first.content
+         link = item.xpath('link/text()').text
+         page = Nokogiri::HTML(open(link))
+         next if page.search('.article_illustration img').first.nil?
+         poster_image = page.search('.article_illustration img').first.attributes['src'].value
+         short_desc = page.search('.article_lead').first.content
+
+         full_desc = page.search('.article_full_text')
+         full_desc.search('.article_illustration').remove
+         full_desc.search('.inject-data').remove
+         full_desc.search('a').remove
+
+         # attribution line: "Source: <site>"
+         copyright = "<p>Источник: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
+         content = "<p>#{short_desc}</p>" + full_desc.inner_html + copyright
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
data/lib/fly_parser/sources/sport.rb ADDED
@@ -0,0 +1,27 @@
+ module Parser
+   class Sport
+
+     def initialize(source)
+       @source = Parser.connect(source)
+       @categories = %w(Футбол Хоккей Баскетбол Авто/мото Теннис) # Football, Hockey, Basketball, Auto/Moto, Tennis
+     end
+
+     def parse_in(category)
+       @category = category
+       @current_page = current_page
+       @current_page.search('.anons').map do |news_item|
+         href = news_item.css('a').first.attributes['href'].value
+         item_page = @current_page.link_with(:href => href).click
+         { title: item_page.search('.titleH1').text, content: item_page.search('.article-textBlock').text }
+       end
+     end
+
+     def current_page
+       @source.link_with(:text => @category).click
+     end
+
+     def parse_all
+       @categories.map { |category| parse_in(category) }
+     end
+   end
+ end
data/lib/fly_parser/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Parser
+   VERSION = '0.0.1'
+ end
data/lib/fly_parser.rb ADDED
@@ -0,0 +1,127 @@
+ # Little library for parsing articles
+ # chmod +x lib/run_parser.rb
+ # run from the Rails root: bundle exec lib/run_parser.rb
+ require 'pry'
+ require 'open-uri'
+ require 'yaml'
+
+ BASE_PATH = File.expand_path("fly_parser/base", File.dirname(__FILE__))
+ LOGO_PATH = File.expand_path("fly_parser/logo.txt", File.dirname(__FILE__))
+ MECHANIZE_FIX = File.expand_path("fly_parser/mechanize_fix", File.dirname(__FILE__))
+
+ require BASE_PATH
+ # RAILS_ROOT, RAILS_BOOT_PATH, RAILS_CONFIG_PATH and CONFIG_PATH are expected
+ # to be defined by the calling script (e.g. run_parser.rb)
+ Dir.chdir RAILS_ROOT
+ require RAILS_BOOT_PATH
+ require RAILS_CONFIG_PATH
+ Pry.config.print = proc { |output, value| output.puts value.ai }
+
+ # Require all of the Ruby files in the given directory.
+ #
+ # path - The String relative path from here to the directory.
+ #
+ # Returns nothing.
+ def require_all(path)
+   glob = File.expand_path(File.join(File.dirname(__FILE__), path, '**', '*.rb'))
+   Dir[glob].each do |f|
+     require f
+   end
+ end
+ require_all 'fly_parser/sources'
+
+ # fix Mechanize by monkey-patching :)
+ require MECHANIZE_FIX
+
+ module Parser
+   class << self
+     # fetch a URL and return the parsed Nokogiri document
+     def http(url)
+       Nokogiri::HTML(open(url))
+     end
+
+     # fetch a URL with Mechanize and return the page
+     def connect(url)
+       agent = Mechanize.new
+       agent.get(url)
+     end
+
+     def save(articles, options)
+       articles.each do |article|
+         item = Article.new(title: article[:title], content: article[:content])
+         item.categories = [Category.find(options[:category_id])]
+         item.remote_image_url = article[:poster_image]
+         item.save!
+       end
+     end
+
+     def start
+       puts logo
+
+       source = find_source
+
+       init_parser(source)
+       parse_and_save(source["items"])
+     end
+
+     def parse_and_save(items)
+       items.each do |item|
+         ap "Parsing #{item['type']}"
+         result = parse_item(item)
+         save_item(item, result)
+       end
+     end
+
+     def save_item(item, result)
+       category = Category.find_or_create_by!(name: JSON.generate(en: item["category"]))
+       ap "and saving to the #{category.localized_name} category"
+       Parser.save result, {category_id: category.id}
+     end
+
+     def parse_item(item)
+       item["parser"].parse_all
+     end
+
+     def config
+       YAML.load_file(CONFIG_PATH)
+     end
+
+     # the first enabled source wins
+     def find_source
+       config["sources"].find { |source| source["enabled"] }
+     end
+
+     def logo
+       File.read(LOGO_PATH)
+     end
+
+     # choose the parser for each item of the source here
+     def init_parser(source)
+       case source["source"]
+       when "fitness"
+         source["items"].each do |item|
+           if item["type"] == "exercises"
+             item["parser"] = Parser::Exercise.new(item["url"], source: source)
+           else
+             item["parser"] = Parser::Fitness.new(item["url"], source: source)
+           end
+         end
+       when "news"
+         source["items"].each do |item|
+           item["parser"] = Parser::News.new(item["url"], source: source)
+         end
+       when "local"
+         source["items"].each do |item|
+           item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
+         end
+       end
+     end
+
+   end
+
+ end
+
+ # astrology
+ # astro = Parser::Astrology.new('http://moj-znak-zodiaka.ru/')
+ # astro.parse_in("Гороскоп на сентябрь 2014","big")
+ # astro.parse_all
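run_parser.rb is referenced in the header comment but is not shipped with the gem. A minimal sketch of what such an entry script would contain, assuming the four constants the loader expects are defined before the require (all paths here are illustrative, not from the package):

    #!/usr/bin/env ruby
    RAILS_ROOT        = "/path/to/rails/app"
    RAILS_BOOT_PATH   = "/path/to/rails/app/config/boot"
    RAILS_CONFIG_PATH = "/path/to/rails/app/config/environment"
    CONFIG_PATH       = File.expand_path("fly_parser/config.yml", File.dirname(__FILE__))

    require_relative 'fly_parser'  # boots Rails, loads the sources, patches Mechanize
    Parser.start

The constants must come first because lib/fly_parser.rb uses them at load time (Dir.chdir and the Rails requires run as soon as the file is loaded).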
metadata ADDED
@@ -0,0 +1,154 @@
+ --- !ruby/object:Gem::Specification
+ name: fly_parser
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Ruslan Korolev
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-10-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rails
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: hashie
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: fakeweb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: awesome_print
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: pry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Simple fly parser for internal uses
+ email:
+ - rusik3@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/fly_parser.rb
+ - lib/fly_parser/base.rb
+ - lib/fly_parser/config_example.yml
+ - lib/fly_parser/logo.txt
+ - lib/fly_parser/mechanize_fix.rb
+ - lib/fly_parser/sources/astrology.rb
+ - lib/fly_parser/sources/exercise.rb
+ - lib/fly_parser/sources/fitness.rb
+ - lib/fly_parser/sources/news.rb
+ - lib/fly_parser/sources/sport.rb
+ - lib/fly_parser/version.rb
+ homepage: http://rubygems.org
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: 2.0.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.1
+ signing_key:
+ specification_version: 4
+ summary: Fly parser
+ test_files: []
+ has_rdoc: