fly_parser 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 95322be0f53123e2b8f9381e142e77a77556ce71
+   data.tar.gz: d04007e65f87581cfc2e4b364c03d48169132c1f
+ SHA512:
+   metadata.gz: 6c277897c67c70bf11a884fb1d3df66cea1d919ff8818c195fc0c7a25daac0ff31dc1b1453b6ba8fd0e31ccc2e2911cf8207d9dad1f9f6b460489a10a4f1bc1d
+   data.tar.gz: e0b89b476a24b764bd684b186dd750387831989316a86758b93f75b5ec2a301182dcc8c01e0c35e57efb0e30e6a9a72160e59ccdd501567685ac75868804f2ee
lib/fly_parser/base.rb ADDED
@@ -0,0 +1,46 @@
+ module Parser
+   # Base class for the HTML page parsers (template-method pattern).
+   class Base
+     def initialize(url, options)
+       @source = Parser.connect(url)
+       @copyright = copyright(options)
+       # ||= so that subclasses may preset these before calling super
+       @limit_pages ||= 5
+       @delay ||= 10
+     end
+
+     # Subclasses override this with a no-argument version that calls
+     # super with their own "next page" link selector.
+     def next_page(css_selector)
+       @next_page = @source.links_with(css_selector)[0]
+     end
+
+     # Parse the current page, then keep following "next page" links,
+     # concatenating all pages into one array.
+     def parse_all
+       result = parse_page
+       next_page
+       i = 1
+       ap "Parsing page #{i}"
+       until @next_page.nil? || i == @limit_pages
+         sleep @delay
+         i += 1
+         ap "Parsing page #{i}"
+         @source = @next_page.click
+         next_page
+         result.concat(parse_page)
+       end
+       result
+     end
+
+     # Hook: subclasses implement the actual scraping of a single page.
+     def parse_page
+     end
+
+     # Collect sibling nodes from first up to and including last.
+     def collect_between(first, last)
+       return nil if first.nil?
+       first == last ? [first] : [first, *collect_between(first.next, last)]
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+   end
+ end
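Note: the following is a minimal sketch of how a concrete parser plugs into this Base class, assuming the template-method reading above; the Blog class name and all selectors here are hypothetical, not part of the gem.

module Parser
  class Blog < Base
    # hypothetical no-arg override that feeds Base#next_page its selector
    def next_page
      super(:class => 'next-link')
    end

    # return an array of hashes, one per article on the current page
    def parse_page
      @source.search('.post').map do |post|
        { title: post.search('.title').text, content: post.search('.entry').inner_html }
      end
    end
  end
end

# Parser::Blog.new(url, source: source_config).parse_all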
lib/fly_parser/config_example.yml ADDED
File without changes
lib/fly_parser/logo.txt ADDED
@@ -0,0 +1,9 @@
+      .---.        .-----------
+     /     \  __  /    ------
+    / /     \(  )/    -----
+   //////   ' \/ `   ---
+  //// / // :    : ---
+ // /   /  /`    '--
+ //          //..\\
+ =============UU====UU====
+             '//||\\`  Fly Group Inc.
lib/fly_parser/mechanize_fix.rb ADDED
@@ -0,0 +1,53 @@
+ class Mechanize::HTTP::Agent
+   MAX_RESET_RETRIES = 10
+
+   # We need to replace the core Mechanize HTTP method:
+   #
+   #   Mechanize::HTTP::Agent#fetch
+   #
+   # with a wrapper that handles the infamous "too many connection resets"
+   # Mechanize bug described here:
+   #
+   #   https://github.com/sparklemotion/mechanize/issues/123
+   #
+   # The wrapper shuts down the persistent HTTP connection when it fails
+   # with this error, and simply tries again. In practice, this only ever
+   # needs to be retried once, but we let it retry a few times
+   # (MAX_RESET_RETRIES), just in case.
+   def fetch_with_retry(
+     uri,
+     method = :get,
+     headers = {},
+     params = [],
+     referer = current_page,
+     redirects = 0
+   )
+     action = "#{method.to_s.upcase} #{uri}"
+     retry_count = 0
+
+     begin
+       fetch_without_retry(uri, method, headers, params, referer, redirects)
+     rescue Net::HTTP::Persistent::Error => e
+       # Pass on any other type of error.
+       raise unless e.message =~ /too many connection resets/
+
+       # Pass on the error if we've already retried too many times.
+       if retry_count >= MAX_RESET_RETRIES
+         puts "**** WARN: Mechanize retried connection reset #{MAX_RESET_RETRIES} times and never succeeded: #{action}"
+         raise
+       end
+
+       # Otherwise, shut down the persistent HTTP connection and try again.
+       puts "**** WARN: Mechanize retrying connection reset error: #{action}"
+       retry_count += 1
+       self.http.shutdown
+       retry
+     end
+   end
+
+   # Alias so #fetch actually uses our new #fetch_with_retry, wrapping the
+   # old one, which is aliased as #fetch_without_retry.
+   alias_method :fetch_without_retry, :fetch
+   alias_method :fetch, :fetch_with_retry
+ end
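Note: a minimal usage sketch of the patch above — once this file is required, every Mechanize request goes through the retry wrapper transparently and nothing changes at the call site. The URL below is a placeholder.

require 'mechanize'
require_relative 'mechanize_fix'

agent = Mechanize.new
# A "too many connection resets" failure here is now retried
# (up to MAX_RESET_RETRIES times) instead of raising immediately.
page = agent.get('http://example.com/articles')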
lib/fly_parser/sources/astrology.rb ADDED
@@ -0,0 +1,67 @@
+ require 'hashie'
+ module Parser
+   class Astrology
+     def initialize(source)
+       # zodiac sign names in Russian, Aries through Pisces
+       @zodiacs = ['Овен','Телец','Близнецы','Рак','Лев','Дева','Весы','Скорпион','Стрелец','Козерог','Водолей','Рыбы']
+       @source = Parser.connect(source)
+
+       # "Horoscope for today", "Horoscope for tomorrow", "Horoscope for the week"
+       small_titles = ["Гороскоп на сегодня", "Гороскоп на завтра", "Гороскоп на неделю"]
+       # "Horoscopes for the month", "Horoscope for 2014", "Horoscope for 2014, year of the green Horse", "Horoscope for September 2014"
+       big_titles = ["Гороскопы на месяц", "Гороскоп на 2014 год", "Гороскоп на 2014 год зеленой Лошади", "Гороскоп на сентябрь 2014"]
+       @titles = Hashie::Mash.new
+       @titles.small = small_titles
+       @titles.big = big_titles
+     end
+
+     # text is the link text to follow; date selects the page layout
+     # ('small' for daily/weekly pages, 'big' for monthly/yearly pages).
+     def parse_in(text = "Гороскоп на сегодня", date = 'small')
+       @text = text
+       @date = date
+       parse_content
+     end
+
+     def parse_content
+       zodiac_links.map do |item|
+         link = item.link
+         zodiac = item.zodiac
+         @page = Parser.http(link.value)
+         content = (@date == 'small' ? parse_small : parse_big)
+         result = Hashie::Mash.new
+         result.zodiac = zodiac
+         result.content = content
+         result
+       end
+     end
+
+     def parse_all
+       small_content = @titles.small.map { |title| {title: title, content: parse_in(title, "small")} }
+       big_content = @titles.big.map { |title| {title: title, content: parse_in(title, "big")} }
+
+       small_content.concat big_content
+     end
+
+     private
+
+     def current_page
+       @source.link_with(:text => @text).click
+     end
+
+     def zodiac_links
+       page = current_page # click the section link once, not once per sign
+       @zodiacs.map do |z|
+         result = Hashie::Mash.new
+         result.link = page.search("a:contains('#{z}')")[0].attributes["href"]
+         result.zodiac = z
+         result
+       end
+     end
+
+     def parse_small
+       @page.css('#main').children.reject do |el|
+         css = el.attributes['class'] && el.attributes['class'].value
+         ['lp50', 'rp50', 'space'].include?(css) ||
+           ['img', 'br', 'b', 'h1'].include?(el.name) ||
+           ["\n", "\n\n"].include?(el.text)
+       end.join
+     end
+
+     def parse_big
+       @page.css('#main .lp50').text
+     end
+   end
+ end
lib/fly_parser/sources/exercise.rb ADDED
@@ -0,0 +1,33 @@
+ module Parser
+   class Exercise < Base
+     def initialize(url, options)
+       @delay = 5 # set before super so Base's `@delay ||= 10` keeps it
+       super
+     end
+
+     def next_page
+       super(:class => 'nextpostslink')
+     end
+
+     # Parse every "read more" article on the current page.
+     def parse_page
+       links = @source.links_with(:class => 'readmore')
+       links.map do |link|
+         page = link.click
+         article = page.search('.post')
+         title = article.search('.title').text
+         wrapper = article.search('.entry')
+         wrapper.search('a').remove_attr('href')
+         poster_image = wrapper.search('img').first.attributes['src'].value
+         start_element = wrapper.at('p:nth-child(2)')
+         # the article body ends at this ad-insertion comment marker
+         end_element = wrapper.xpath("//comment()[. = ' adman_adcode_after ']").first
+         next if start_element.next.nil?
+         content = collect_between(start_element, end_element).map(&:to_s).join
+         next if content.empty?
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
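Note: a minimal, self-contained sketch of what Base#collect_between does with the markers above — it walks sibling nodes from a start element up to and including an end marker. The HTML here is made up for illustration, and collect_between is copied from Base so the snippet runs on its own.

require 'nokogiri'

doc = Nokogiri::HTML(<<-HTML)
  <div class="entry">
    <p>intro</p>
    <p>first body paragraph</p>
    <p>second body paragraph</p>
    <!-- adman_adcode_after -->
    <p>ad follows here</p>
  </div>
HTML

wrapper = doc.at('.entry')
start_element = wrapper.at('p:nth-child(2)')
end_element = wrapper.xpath(".//comment()[. = ' adman_adcode_after ']").first

def collect_between(first, last)
  return nil if first.nil?
  first == last ? [first] : [first, *collect_between(first.next, last)]
end

# collects the two body paragraphs (plus interleaved whitespace text
# nodes) up to and including the marker comment
puts collect_between(start_element, end_element).map(&:to_s).join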
lib/fly_parser/sources/fitness.rb ADDED
@@ -0,0 +1,36 @@
+ # TODO: parse all pages first, and download only the newest later
+ module Parser
+   class Fitness < Base
+     def next_page
+       super(:id => 'next_page')
+     end
+
+     def parse_page
+       links = @source.links_with(:class => 'article-headine__link')
+       links.map do |link|
+         page = link.click
+         article = page.search('div[itemscope]')
+         title = article.search('.article-name').text
+         wrapper = article.search('.article-text__wrapper')
+         # if there is no poster image, the item is likely a promo ad, so skip it
+         next if wrapper.search('.article_image__pic').first.nil?
+         poster_image = @copyright[:url] + wrapper.search('.article_image__pic').first.attributes['src'].value
+         wrapper.search('.article_image__pic').first.remove
+         # expand relative paths for the remaining images
+         images = wrapper.search('.article_image__pic')
+         images.each { |image| image.attributes['src'].value = @copyright[:url] + image.attributes['src'].value }
+         # remove ads and strip links
+         wrapper.search('.po_theme').remove
+         wrapper.search('a').remove_attr('href')
+
+         {title: title, content: wrapper.inner_html, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
lib/fly_parser/sources/news.rb ADDED
@@ -0,0 +1,53 @@
+ module Parser
+   class News
+     def initialize(source, options = {})
+       source = fake_url(source) if options[:type] == :file
+       @copyright = copyright(options)
+       @source = Parser.connect(source)
+       @delay ||= 10
+     end
+
+     # Serve a local file from a fake URL so Mechanize can "fetch" it;
+     # the URL itself is arbitrary.
+     def fake_url(source)
+       stream = File.read(source)
+       url = "http://www.google.com"
+       FakeWeb.register_uri(:get, url, :body => stream, :content_type => "application/xml")
+       url
+     end
+
+     def copyright(options)
+       source = options[:source]
+       {
+         url: source['copyright'],
+         title: source['copyright_title']
+       }
+     end
+
+     def parse_all
+       items = @source.search('//item')
+       last_date = Time.now - 2.years # 2 years back, for development
+       # Nokogiri::XML::NodeSet has no select!/reject!, so reassign;
+       # pubDate is a string, so parse it before comparing with Time
+       items = items.select { |item| Time.parse(item.xpath('pubDate').first.content) > last_date }
+       items.map do |item|
+         title = item.xpath('title/text()').text
+         link = item.xpath('link/text()').text
+         page = Nokogiri::HTML(open(link))
+         next if page.search('.article_illustration img').first.nil?
+         poster_image = page.search('.article_illustration img').first.attributes['src'].value
+         short_desc = page.search('.article_lead').first.content
+
+         full_desc = page.search('.article_full_text')
+         full_desc.search('.article_illustration').remove
+         full_desc.search('.inject-data').remove
+         full_desc.search('a').remove
+
+         # "Источник" is Russian for "Source"
+         copyright = "<p>Источник: <a href='#{@copyright[:url]}'>#{@copyright[:title]}</a></p>"
+         content = "<p>#{short_desc}</p>" + full_desc.inner_html + copyright
+         {title: title, content: content, poster_image: poster_image}
+       end.compact
+     end
+   end
+ end
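Note: a minimal sketch of the FakeWeb trick used in fake_url above — register a canned body for a URL, and Mechanize fetches it as if it came over HTTP. The URL and feed body here are made up.

require 'mechanize'
require 'fakeweb'

url = "http://fake-feed.example/rss"
body = "<rss><channel><item><title>hi</title></item></channel></rss>"
FakeWeb.register_uri(:get, url, :body => body, :content_type => "application/xml")

agent = Mechanize.new
page = agent.get(url) # served from the registered body, no network call
puts page.body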
lib/fly_parser/sources/sport.rb ADDED
@@ -0,0 +1,27 @@
+ module Parser
+   class Sport
+     def initialize(source)
+       @source = Parser.connect(source)
+       # category names in Russian: football, hockey, basketball, auto/moto, tennis
+       @categories = %w(Футбол Хоккей Баскетбол Авто/мото Теннис)
+     end
+
+     def parse_in(category)
+       @category = category
+       @current_page = current_page
+       @current_page.search('.anons').map do |news_item|
+         href = news_item.css('a').first.attributes['href'].value
+         item_page = @current_page.link_with(:href => href).click
+         { title: item_page.search('.titleH1').text, content: item_page.search('.article-textBlock').text }
+       end
+     end
+
+     def current_page
+       @source.link_with(:text => @category).click
+     end
+
+     def parse_all
+       @categories.map { |category| parse_in(category) }
+     end
+   end
+ end
lib/fly_parser/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Parser
+   VERSION = '0.0.1'
+ end
data/lib/fly_parser.rb ADDED
@@ -0,0 +1,122 @@
+ # A little library for parsing articles.
+ # Setup: chmod +x lib/run_parser.rb
+ # then run from the Rails root: bundle exec lib/run_parser.rb
+ require 'pry'
+ require 'open-uri'
+ require 'yaml'
+
+ BASE_PATH = File.expand_path("fly_parser/base", File.dirname(__FILE__))
+ LOGO_PATH = File.expand_path("fly_parser/logo.txt", File.dirname(__FILE__))
+ MECHANIZE_FIX = File.expand_path("fly_parser/mechanize_fix", File.dirname(__FILE__))
+
+ require BASE_PATH
+ # RAILS_ROOT, RAILS_BOOT_PATH, RAILS_CONFIG_PATH and CONFIG_PATH are
+ # expected to be defined by the launcher script (run_parser.rb).
+ Dir.chdir RAILS_ROOT
+ require RAILS_BOOT_PATH
+ require RAILS_CONFIG_PATH
+ Pry.config.print = proc { |output, value| output.puts value.ai }
+
+ # Require all of the Ruby files in the given directory.
+ #
+ # path - The String relative path from here to the directory.
+ #
+ # Returns nothing.
+ def require_all(path)
+   glob = File.expand_path(File.join(File.dirname(__FILE__), path, '**', '*.rb'))
+   Dir[glob].each do |f|
+     require f
+   end
+ end
+ require_all 'fly_parser/sources'
+
+ # fix Mechanize by monkey-patching :)
+ require MECHANIZE_FIX
+
+ module Parser
+   class << self
+     # Fetch a URL and return it as a parsed Nokogiri document.
+     def http(url)
+       Nokogiri::HTML(open(url))
+     end
+
+     # Open a URL with Mechanize and return the page.
+     def connect(url)
+       agent = Mechanize.new
+       agent.get(url)
+     end
+
+     def save(articles, options)
+       articles.each do |article|
+         item = Article.new(title: article[:title], content: article[:content])
+         item.categories = [Category.find(options[:category_id])]
+         item.remote_image_url = article[:poster_image]
+         item.save!
+       end
+     end
+
+     def start
+       puts logo
+
+       source = find_source
+
+       init_parser(source)
+       parse_and_save(source["items"])
+     end
+
+     def parse_and_save(items)
+       items.each do |item|
+         ap "Parsing #{item['type']}"
+         result = parse_item(item)
+         save_item(item, result)
+       end
+     end
+
+     def save_item(item, result)
+       category = Category.find_or_create_by!(name: JSON.generate(en: item["category"]))
+       ap "and saving to the #{category.localized_name} category"
+       Parser.save result, {category_id: category.id}
+     end
+
+     def parse_item(item)
+       item["parser"].parse_all
+     end
+
+     def config
+       YAML.load_file(CONFIG_PATH)
+     end
+
+     def find_source
+       config["sources"].find { |source| source["enabled"] }
+     end
+
+     def logo
+       File.read(LOGO_PATH)
+     end
+
+     # choose the parser for each source item here
+     def init_parser(source)
+       case source["source"]
+       when "fitness"
+         source["items"].each do |item|
+           if item["type"] == "exercises"
+             item["parser"] = Parser::Exercise.new(item["url"], source: source)
+           else
+             item["parser"] = Parser::Fitness.new(item["url"], source: source)
+           end
+         end
+       when "news"
+         source["items"].each do |item|
+           item["parser"] = Parser::News.new(item["url"], source: source)
+         end
+       when "local"
+         if source["enabled"]
+           source["items"].each do |item|
+             item["parser"] = Parser::News.new(item["file"], {type: :file, source: source})
+           end
+         end
+       end
+     end
+   end
+ end
+
+ # astrology usage example:
+ # astro = Parser::Astrology.new('http://moj-znak-zodiaka.ru/')
+ # astro.parse_in("Гороскоп на сентябрь 2014", "big")   # "Horoscope for September 2014"
+ # astro.parse_all
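Note: the gem ships a config_example.yml whose contents are not shown in this diff. Below is a hedged sketch of the structure that Parser.config, find_source, init_parser and the copyright helpers appear to expect; every key is inferred from the code above, and all URLs and values are placeholders, not the real example file.

# config.yml (hypothetical)
sources:
  - source: "fitness"
    enabled: true
    copyright: "http://fitness-site.example"
    copyright_title: "Fitness Site"
    items:
      - type: "exercises"
        category: "Exercises"
        url: "http://fitness-site.example/exercises"
      - type: "articles"
        category: "Fitness"
        url: "http://fitness-site.example/articles"
  - source: "news"
    enabled: false
    copyright: "http://news-site.example"
    copyright_title: "News Site"
    items:
      - type: "news"
        category: "News"
        url: "http://news-site.example/rss"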
metadata ADDED
@@ -0,0 +1,154 @@
+ --- !ruby/object:Gem::Specification
+ name: fly_parser
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Ruslan Korolev
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-10-25 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rails
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 4.1.4
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 1.6.3.1
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: hashie
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: fakeweb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: awesome_print
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: pry
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Simple fly parser for internal uses
+ email:
+ - rusik3@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/fly_parser.rb
+ - lib/fly_parser/base.rb
+ - lib/fly_parser/config_example.yml
+ - lib/fly_parser/logo.txt
+ - lib/fly_parser/mechanize_fix.rb
+ - lib/fly_parser/sources/astrology.rb
+ - lib/fly_parser/sources/exercise.rb
+ - lib/fly_parser/sources/fitness.rb
+ - lib/fly_parser/sources/news.rb
+ - lib/fly_parser/sources/sport.rb
+ - lib/fly_parser/version.rb
+ homepage: http://rubygems.org
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: 2.0.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.1
+ signing_key:
+ specification_version: 4
+ summary: Fly parser
+ test_files: []
+ has_rdoc: