uni_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 71662cdc320d077e5fd51cbd681a8f1fe7ac4713
4
+ data.tar.gz: e3104649b65113a2405677b2a7d7995f6fe11e1b
5
+ SHA512:
6
+ metadata.gz: 062e3f2b1f8f87656bd73127b312e7a844b394caab75dcf13737f0c5828a97ce49af9628487ddfa8d954562605470603b384af5f59e20993f50d8813fcb2bcd8
7
+ data.tar.gz: 33f22bc846b5a49408162bf2a4abf2b511ec2670ea4f226616de7967a1384d0a8248170d6fbd1df5f82c9c6c1c1d4407f24c0005149416355f9bfcf9e73b0d41
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ *.bundle
11
+ *.so
12
+ *.o
13
+ *.a
14
+ mkmf.log
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in uni_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Artem Petrov
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # UniParser
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'uni_parser'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install uni_parser
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/[my-github-username]/uni_parser/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,9 @@
1
require 'bundler/gem_tasks'
require 'rake/testtask'

# Test task: adds "test" to the load path and picks up every file matching
# test/**/*_test.rb.
Rake::TestTask.new do |test_task|
  test_task.libs.push('test')
  test_task.pattern = 'test/**/*_test.rb'
end

# Running `rake` with no arguments runs the test task.
task default: :test
@@ -0,0 +1,7 @@
1
# Mixin that hands out the elements of an array in rotating order.
module RoundRobin
  # Returns the element at the head of +arry+ and moves it to the tail, so
  # repeated calls cycle through every element in order.
  #
  # The array is rotated in place.
  #
  # @param arry [Array] the queue to rotate; mutated in place
  # @return [Object, nil] the former head element, or nil when +arry+ is empty
  def next!(arry)
    # Shifting an empty array yields nil; appending that nil would turn the
    # empty queue into [nil], so bail out before touching it.
    return if arry.empty?

    head = arry.first
    arry.rotate!
    head
  end
end
@@ -0,0 +1,87 @@
1
module UniParser
  # Wrapper around a Mechanize agent that picks a user-agent alias and,
  # when one is available, an HTTP proxy before each page request.
  class Agent
    include RoundRobin

    # Mechanize user-agent aliases handed to #set_user_agent one at a time.
    # NOTE: this array is rotated in place by RoundRobin#next!, so it is
    # deliberately left unfrozen.
    AGENT_ALIASES = ['Mac Safari', 'Mac Firefox', 'Linux Firefox', 'Linux Mozilla',
                     'Windows IE 8', 'Windows IE 9', 'Windows Mozilla']

    attr_reader :current_proxy, :options

    # @param options [Hash]
    # @option options [String, false, nil] :proxy
    #   a truthy value is handed to Proxy.new and used for every request;
    #   exactly +false+ disables proxying; anything else falls back to the
    #   proxy list from UniParser.config (see #set_proxy)
    def initialize(options = {})
      @agent = Mechanize.new do |agent|
        agent.open_timeout = UniParser.config.open_timeout if UniParser.config.open_timeout
        agent.read_timeout = UniParser.config.read_timeout if UniParser.config.read_timeout
      end
      @options = options
      @proxy = Proxy.new(options[:proxy]) if options[:proxy]
    end

    # @return the history object of the underlying Mechanize agent
    def history
      @agent.history
    end

    # Fetches +url+ and returns the response body.
    #
    # The agent is reset and redirects are not followed; a 302 response is
    # treated as a failure. On any failure the current proxy is reported to
    # the configured proxy list and the error is re-raised.
    #
    # @param url [String] address to fetch
    # @param parameters [Array, Hash] query parameters forwarded to Mechanize#get
    # @param referer [Object, nil] referer forwarded to Mechanize#get
    # @param headers [Hash] extra request headers forwarded to Mechanize#get
    # @return the body of the fetched page
    # @raise [StandardError] whatever Mechanize raised, or RuntimeError on a 302
    def get(url, parameters = [], referer = nil, headers = {})
      @agent.reset
      @agent.follow_redirect = false

      set_proxy
      set_user_agent

      begin
        # Forward the caller's parameters and headers instead of dropping them.
        @page = @agent.get(url, parameters, referer, headers)
        raise "Unexpected redirect (302) for #{url}" if @page.code == '302'
      rescue
        freeze_current_proxy
        raise
      end

      @page.body
    end

    # Downloads +url+ with the underlying agent and copies the body into a
    # Tempfile. Unlike #get this does not reset the agent or pick a new
    # proxy / user agent.
    #
    # @param url [String] address of the file
    # @return [Tempfile] flushed, still-open temp file holding the body
    # @raise [StandardError] whatever the underlying agent raised
    def get_file(url)
      begin
        file = @agent.get url
      rescue
        freeze_current_proxy
        raise
      end

      ext = file.filename.split('.').last
      if file.body_io.instance_of?(Tempfile)
        # #create_temp needs an object responding to #string, so buffer the
        # Tempfile contents into a StringIO first.
        body = StringIO.new(file.body_io.read)
        create_temp(file.filename, body, ext)
      else
        create_temp(file.filename, file.body_io, ext)
      end
    end

    # Writes +body+ into a new binary-mode Tempfile.
    #
    # @param name [String] basename prefix for the temp file
    # @param body [#string] object whose #string is written (e.g. StringIO)
    # @param ext [String] extension (without the dot) used as the suffix
    # @return [Tempfile] the flushed, still-open temp file
    def create_temp(name, body, ext)
      temp_file = Tempfile.new([name, ".#{ext}"])
      temp_file.binmode
      temp_file.write body.string
      temp_file.flush
      temp_file
    end

    # Chooses the proxy for the next request and applies it to the agent.
    # Does nothing when proxying was disabled with +proxy: false+ or when
    # neither an explicit proxy nor a configured proxy list is available.
    def set_proxy
      # Only an explicit false disables proxying; nil means "use the config".
      return if @options[:proxy] == false

      return unless @proxy || UniParser.config.use_proxy?

      @current_proxy = @proxy || UniParser.config.proxy_list.next
      @agent.set_proxy current_proxy.host, current_proxy.port
    end

    # Applies the next alias from AGENT_ALIASES to the agent.
    def set_user_agent
      @agent.user_agent_alias = next! AGENT_ALIASES
    end

    private

    # Reports the proxy used for the failed request to the configured proxy
    # list. Skipped when no proxy list is configured, so that the original
    # request error is not replaced by a NoMethodError on nil.
    def freeze_current_proxy
      list = UniParser.config.proxy_list
      list.freeze_proxy(current_proxy) if list
    end
  end
end
@@ -0,0 +1,19 @@
1
module UniParser
  # Settings object: request timeouts, proxy freeze time and the proxy list.
  class Config
    attr_accessor :proxy_freeze_time, :read_timeout, :open_timeout

    # Only a reader is generated here; the writer below is custom, and
    # generating it with attr_accessor first triggers a redefinition warning.
    attr_reader :proxy_list

    # Replaces the proxy list with one built from +url_list+.
    #
    # @param url_list [Array<String>, String, nil] proxy URLs; a single URL or
    #   nil is accepted and treated as a one-element / empty list
    def proxy_list=(url_list)
      @proxy_list = UniParser::ProxyList.new
      Array(url_list).each { |url| @proxy_list << Proxy.new(url) }
    end

    # @return [Boolean] true when a proxy list is set and yields at least one
    #   proxy
    def use_proxy?
      !proxy_list.nil? && proxy_list.any?
    end
  end
end
@@ -0,0 +1,72 @@
1
+ # encoding: utf-8
2
module UniParser

  # Base class for page descriptions. Subclasses declare fields with
  # Page.field; field blocks are evaluated against the page instance and can
  # use #root, #json, #source, #file and the formatting helpers below.
  class Page
    # Russian month names (genitive case) in calendar order, replaced with
    # their month numbers by #format_date.
    MONTH_NAMES = ['января', 'февраля', 'марта', 'апреля',
                   'мая', 'июня', 'июля', 'августа', 'сентября',
                   'октября', 'ноября', 'декабря'].freeze

    attr_accessor :result

    # Declares a field on the page class.
    #
    # Without a block this is a plain attr_accessor. With a block, a writer
    # plus a lazy reader are defined: the first read evaluates the block in
    # the instance, passes String results through #clean_up and stores the
    # value; later reads (or an earlier explicit assignment) return the
    # stored value without running the block.
    #
    # @param name [String, Symbol] field name
    # @yield computes the field value; evaluated with instance_eval
    def self.field(name, &block)
      return attr_accessor(name.to_sym) unless block_given?

      ivar = "@#{name}"
      attr_writer name.to_sym
      define_method(name.to_sym) do
        unless instance_variable_defined?(ivar)
          value = instance_eval(&block)
          value = clean_up(value) if value.is_a?(String)
          instance_variable_set(ivar, value)
        end
        instance_variable_get(ivar)
      end
    end

    # NOTE: the :source and :url keys are removed from the hash that is
    # passed in.
    #
    # @param [Object] options
    # @option options [String] :source Raw HTML as a string
    # @option options [String] :url URL of the page
    def initialize(options = {})
      @source = options.delete(:source)
      @url = options.delete(:url)
    end

    # @return [UniParser::Agent] lazily created, memoized agent
    def agent
      @agent ||= UniParser::Agent.new(proxy: @proxy)
    end

    # Returns the page source: the value given at construction, or the body
    # fetched from the page URL on first use.
    #
    # @return the raw page source
    # @raise [ConnectionError] when neither source nor URL is available or
    #   the request fails; the message includes the agent history
    def source
      @source ||= fetch_source
    rescue StandardError => e
      # StandardError rather than Exception, so Interrupt/SystemExit are not
      # converted into ConnectionError.
      raise ConnectionError, "#{e}, #{agent.history.inspect}", e.backtrace
    end

    # Returns the source decoded as JSON; a Hash source is returned as is.
    #
    # @return the decoded document
    # @raise [DecodeError] on any failure while obtaining or decoding the
    #   source; the message includes the agent history
    def json
      return source if source.is_a?(Hash)
      @json ||= MultiJson.decode(source)
    rescue StandardError => e
      raise DecodeError, "#{e}, #{agent.history.inspect}", e.backtrace
    end

    # @return the source parsed by Nokogiri as UTF-8 HTML (memoized)
    def root
      # Nokogiri::HTML takes (input, url, encoding); the encoding belongs in
      # the third position, the second one is the document URL.
      @root ||= Nokogiri::HTML(source, nil, 'utf-8')
    end

    # Downloads +file_url+ through the page's agent.
    #
    # @param file_url [String]
    # @return the result of Agent#get_file
    def file(file_url)
      agent.get_file file_url
    end

    # Normalizes a scraped string: trims whitespace, drops a leading "..."
    # and a trailing comma, and turns non-breaking spaces into plain spaces.
    #
    # @param str [String]
    # @return [String]
    def clean_up(str)
      # \A / \z anchor to the whole string; ^ / $ would also match at line
      # boundaries in the middle of multi-line text.
      str.strip.sub(/\A\.{3}/, '').sub(/,\z/, '').gsub("\u00A0", "\u0020").strip
    end

    # Replaces the first occurrence of each Russian month name with its
    # month number and joins the numeric parts with dashes,
    # e.g. "12 января 2014" => "12-1-2014".
    #
    # @param str [String]
    # @return [String]
    def format_date(str)
      numeric = MONTH_NAMES.each_with_index.reduce(str) do |text, (month, index)|
        text.sub(month, (index + 1).to_s)
      end
      numeric.strip.gsub(/\D+/, '-')
    end

    private

    # Fetches the body for @url. Raises (instead of the original throw,
    # which is meant for non-local jumps, not errors) when there is no URL or
    # the agent returned nothing; #source wraps the error.
    def fetch_source
      raise 'Define source or url... =(' unless @url

      agent.get(@url) || raise('Define source or url... =(')
    end

  end
end
@@ -0,0 +1,22 @@
1
module UniParser
  # Host/port pair of an HTTP proxy plus a "frozen until" timestamp used to
  # take the proxy out of rotation for a while.
  #
  # NOTE: #freeze and #frozen? shadow Object#freeze / Object#frozen?. Here
  # they deal with the time-based cooldown only and do not make the object
  # immutable.
  class Proxy
    attr_reader :host, :port

    # @param url [String] proxy URL; host and port are taken from it
    def initialize(url)
      uri = URI(url)
      @host = uri.host
      @port = uri.port
      @frozen_to = nil
    end

    # Marks the proxy as frozen until now + #freeze_time.
    #
    # @return [Time] the moment the freeze ends
    def freeze
      @frozen_to = Time.now + freeze_time
    end

    # @return the configured proxy_freeze_time, or one minute when unset
    def freeze_time
      UniParser.config.proxy_freeze_time || 1.minute
    end

    # @return [Boolean] true while the freeze period has not elapsed; false
    #   for a proxy that was never frozen
    def frozen?
      # Without the nil check a never-frozen proxy would compare Time with
      # nil and raise ArgumentError.
      !@frozen_to.nil? && Time.now < @frozen_to
    end
  end
end
+ end
@@ -0,0 +1,42 @@
1
module UniParser
  # Rotating pool of proxies. Proxies reported via #freeze_proxy are moved
  # to a separate list and put back into rotation once they report that
  # they are no longer frozen.
  class ProxyList
    include Enumerable
    include RoundRobin

    attr_reader :frozen_proxies
    attr_reader :proxies

    # @param proxies [Array] initial pool; the array is used (and later
    #   mutated) as is, not copied
    def initialize(proxies = [])
      @proxies = proxies
      @frozen_proxies = []
    end

    # Iterates over the proxies currently in rotation (frozen ones excluded).
    def each(&block)
      @proxies.each(&block)
    end

    # Adds +proxy+ to the rotation.
    def <<(proxy)
      @proxies << proxy
    end

    # Returns the next proxy in rotation, after first returning thawed
    # proxies to the pool.
    #
    # @return the next proxy, or nil when no proxy is available
    def next
      unfreeze_proxies
      # Rotating an empty pool must not add a nil entry to it.
      return if @proxies.empty?

      next! @proxies
    end

    # Takes +proxy+ out of rotation and starts its freeze period.
    # A nil +proxy+ is ignored.
    def freeze_proxy(proxy)
      return unless proxy

      @proxies.delete proxy
      proxy.freeze
      @frozen_proxies << proxy unless @frozen_proxies.include?(proxy)
    end

    # Moves every proxy whose freeze period is over back into rotation.
    #
    # The frozen list is split first and only then updated: deleting from an
    # array while iterating over it skips elements, and stopping at the
    # first still-frozen proxy would leave later, already thawed ones stuck.
    def unfreeze_proxies
      still_frozen, thawed = @frozen_proxies.partition(&:frozen?)
      # replace/concat keep the identity of both arrays for code holding
      # references obtained through the readers.
      @frozen_proxies.replace(still_frozen)
      @proxies.concat(thawed)
    end
  end
end
@@ -0,0 +1,3 @@
1
module UniParser
  # Gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
data/lib/uni_parser.rb ADDED
@@ -0,0 +1,47 @@
1
+ require 'active_support/core_ext'
2
+ require 'mechanize'
3
+ require 'multi_json'
4
+ require 'round_robin'
5
+
6
+ require 'uni_parser/version'
7
+ require 'uni_parser/config'
8
+ require 'uni_parser/agent'
9
+ require 'uni_parser/proxy'
10
+ require 'uni_parser/proxy_list'
11
+
12
+ require 'uni_parser/page'
13
+ require 'uni_parser/pages/base'
14
+ require 'uni_parser/pages/book'
15
+
16
module UniParser
  # Process-wide side effect: loading the library switches the default
  # external encoding to UTF-8.
  Encoding.default_external = Encoding::UTF_8

  # Replaces the current configuration with a fresh one and yields it for
  # setup. Settings from earlier configure calls are discarded.
  #
  # @yieldparam config [UniParser::Config] the new configuration
  # @return [UniParser::Config] the new configuration
  def self.configure
    @config = UniParser::Config.new
    # Calling configure without a block just resets to defaults instead of
    # raising LocalJumpError.
    yield @config if block_given?
    @config
  end

  # @return [UniParser::Config] the current configuration, created on first
  #   access
  def self.config
    @config ||= UniParser::Config.new
  end

  # Discards the current configuration.
  #
  # @return [UniParser::Config] the fresh configuration
  def self.config_reset
    @config = UniParser::Config.new
  end

  # Common ancestor of the library's own errors.
  class BaseError < StandardError
  end

  # NOTE: inside the UniParser namespace this shadows Ruby's ::TypeError.
  class TypeError < BaseError
  end

  # Raised by Page#json when the source cannot be decoded.
  class DecodeError < BaseError
  end

  # Raised by Page#source when the page cannot be obtained.
  class ConnectionError < BaseError
  end

  class InitError < BaseError
  end

end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be read before the gem is
# installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'uni_parser/version'

Gem::Specification.new do |spec|
  spec.name          = "uni_parser"
  spec.version       = UniParser::VERSION
  spec.authors       = ["Artem Petrov"]
  spec.email         = ["partos0511@gmail.com"]
  spec.summary       = %q{Uniparser}
  spec.description   = %q{Universal parser, you must only describe a page with nokogiri}
  spec.homepage      = ""
  spec.license       = "MIT"

  # Only files tracked by git are packaged; anything uncommitted (or ignored)
  # is left out of the built gem.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency "nokogiri", "~> 1.6.1"
  # Pinned to one exact mechanize release.
  spec.add_runtime_dependency "mechanize", "2.7.1"
  spec.add_runtime_dependency "activesupport"
  spec.add_runtime_dependency "multi_json", "~> 1.9"

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uni_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Artem Petrov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-30 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.6.1
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 1.6.1
27
+ - !ruby/object:Gem::Dependency
28
+ name: mechanize
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 2.7.1
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 2.7.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: activesupport
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: multi_json
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.9'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.9'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.7'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.7'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ description: Universal parser, you must only describe a page with nokogiri
98
+ email:
99
+ - partos0511@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - Gemfile
106
+ - LICENSE.txt
107
+ - README.md
108
+ - Rakefile
109
+ - lib/round_robin.rb
110
+ - lib/uni_parser.rb
111
+ - lib/uni_parser/agent.rb
112
+ - lib/uni_parser/config.rb
113
+ - lib/uni_parser/page.rb
114
+ - lib/uni_parser/proxy.rb
115
+ - lib/uni_parser/proxy_list.rb
116
+ - lib/uni_parser/version.rb
117
+ - uni_parser.gemspec
118
+ homepage: ''
119
+ licenses:
120
+ - MIT
121
+ metadata: {}
122
+ post_install_message:
123
+ rdoc_options: []
124
+ require_paths:
125
+ - lib
126
+ required_ruby_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ required_rubygems_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ requirements: []
137
+ rubyforge_project:
138
+ rubygems_version: 2.4.3
139
+ signing_key:
140
+ specification_version: 4
141
+ summary: Uniparser
142
+ test_files: []
143
+ has_rdoc: