pagedump 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0d237dc01548838acdbf7913b9b5fac0300b1e61
4
+ data.tar.gz: eb81b3546ee1a91f1f330dec98ddfb02224cb7b7
5
+ SHA512:
6
+ metadata.gz: ce9df7137158f4df8629ef06e9e3159f72953db34c7844a9121d86c8d238f56b8dd30a88e1e8bd8f706638db9e6733ee72d81cf9bd8d39f186c39a91e5adf5fe
7
+ data.tar.gz: 3a58f23282572bacf3208f93b3f2509f53c80398968a128fcbdc6492ab28f91f1af004387e01c26f98aa4843aa79be72dd058297669bfedf62ee10960ccfe505
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ before_install: gem install bundler -v 1.11.2
5
+ deploy:
6
+ provider: rubygems
7
+ api_key:
8
+ secure: O5Q8XZmw0jljpG312fJGdBXBEB8IP1N9xhqBV1nwVGTI+L7ANVwzoAQkYtxFkDhttzXCgzgdsXfw43vuMGWbZ0dnspwZoEReYcsa9kAwtBGX/WvRovtFmzw1AWZSDUvo63+KMQFB8cGzLibYUDXJwz177UdiRss2oijIrMwX8YHeLv607HvaTAjSSbMkX+G2G8D7Wbbhlz752FuK+wxtrgez/8ba9xHBhYvB8VmYxqQl0Wzq9UEHB23nyraZTurJpZk0XKyn8210Sov/Cf2YsVVL1wZEusamwyztU4x4RpEhgPC8UKl7ktELkDfjY3t+PHBeP5BXprbsQTV3QuB1P+oX83ig3r8vpMaewlrmAnkU9P1kA64V1zn/HuDFc0CfQQVHkkp0xdJxW9D9NDd+Y87JAUge0OLddKDeUQVxrunGZ1hNvqPanRfSosEKKYS/apRo5W3Fo6e0rRV63FTnKRyaV9eactDL7rDZM7pn1LzsD9iGi5mhgMO5ZjrvGFuMqT78772PXSZAycF6x2fwhVSw9T8/OpMzqYeX8+7PGa3h5RDw1IHOWdV6At26vRq270F8ONOftIux2z0xDOqinH6uJLuKuGpf4jk7XZkcdEZlLbZHa6cpI28kyogrc6bIbEKACVGtbUQRVak9Y8kcWpHzgt2PlZuXOXD7d4tCWH4=
9
+ gem: pagedump
10
+ on:
11
+ tags: true
12
+ repo: pompadour/pagedump
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in pagedump.gemspec
4
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Damien Cram
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,65 @@
1
+ # Pagedump
2
+
3
+ ## Installation
4
+
5
+ Add this line to your application's Gemfile:
6
+
7
+ ```ruby
8
+ gem 'pagedump'
9
+ ```
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install pagedump
18
+
19
+ ## Usage
20
+
21
+ Create a page driver:
22
+
23
+ ``` ruby
24
+ require "pagedump"
25
+
26
+ class LeMonde < Pagedump::Driver
27
+ URL = "http://www.lemonde.fr/"
28
+
29
+ def links page
30
+ link 3, page.css(".titre_une a")[0]['href']
31
+
32
+ page.css(".titres_hauts article").each do |e|
33
+ link 1, e.css('a')[0]["href"]
34
+ end
35
+ end
36
+ end
37
+ ```
38
+
39
+ And scrape its links:
40
+
41
+ ``` ruby
42
+ require "pagedump"
43
+
44
+ headlines = LeMonde.new.scrap
45
+ headlines.each do |headline, w|
46
+ puts "%3d\t%-s" % [w, headline]
47
+ end
48
+
49
+ ```
50
+
51
+
52
+ ## Development
53
+
54
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
55
+
56
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
57
+
58
+ ## Contributing
59
+
60
+ Bug reports and pull requests are welcome on GitHub at https://github.com/pompadour/pagedump.
61
+
62
+
63
+ ## License
64
+
65
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "pagedump"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,30 @@
1
+ require 'logging'
2
+ require 'mechanize'
3
+ require "pagedump/version"
4
+ require "pagedump/driver"
5
+
6
+ module Pagedump
7
+ class << self
8
+ def logger
9
+ Logging.logger[Pagedump]
10
+ end
11
+ def load_drivers path
12
+ Dir[File.join(path, "**/*.rb")].each do |p|
13
+ require p
14
+ end
15
+ end
16
+ def register_driver driver_cls
17
+ drivers[driver_cls.name] = driver_cls
18
+ end
19
+ def drivers
20
+ @drivers ||= {}
21
+ end
22
+ def driver driver_name
23
+ if (cls = drivers[driver_name]).nil?
24
+ raise("No such driver class: #{driver_name}")
25
+ else
26
+ cls.new
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,38 @@
1
+ module Pagedump
2
+ # WRANING !!
3
+ # No Thread-Safe
4
+ class Driver
5
+ attr_reader :headlines
6
+
7
+ def self.inherited(subclass)
8
+ Pagedump.register_driver subclass
9
+ puts "New driver: #{subclass}"
10
+ end
11
+
12
+ def initialize
13
+ @wlinks = {}
14
+ end
15
+
16
+ def link weight, href
17
+ abs_link = URI.parse(url).merge(URI.parse(href)).to_s
18
+ @wlinks[abs_link] = weight
19
+ end
20
+
21
+ def scrap
22
+ @wlinks = {}
23
+ Pagedump.logger.info "Getting headlines for url #{url}"
24
+ agent = Mechanize.new
25
+ page = agent.get(url)
26
+ self.links page
27
+ @wlinks
28
+ end
29
+
30
+ def url
31
+ self.class::URL
32
+ end
33
+
34
+ def name
35
+ self.class.name
36
+ end
37
+ end
38
+ end
@@ -0,0 +1,3 @@
1
+ module Pagedump
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'pagedump/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "pagedump"
8
+ spec.version = Pagedump::VERSION
9
+ spec.authors = ["Damien Cram"]
10
+ spec.email = ["damien.cram@laposte.net"]
11
+
12
+ spec.summary = %q{An API to scrap pages on the web and to extract selected links from it.}
13
+ spec.description = %q{An API to scrap pages on the web and to extract selected links from it.}
14
+ spec.homepage = "http://github.com/pompadour/pagedump/"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "mechanize", "~> 2.7"
23
+ spec.add_dependency "logging", "~> 2.1"
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.11"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3.0"
28
+ end
metadata ADDED
@@ -0,0 +1,127 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pagedump
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Damien Cram
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2016-05-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: mechanize
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2.7'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: logging
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.1'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.1'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.11'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.11'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '3.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ description: An API to scrap pages on the web and to extract selected links from it.
84
+ email:
85
+ - damien.cram@laposte.net
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - bin/console
98
+ - bin/setup
99
+ - lib/pagedump.rb
100
+ - lib/pagedump/driver.rb
101
+ - lib/pagedump/version.rb
102
+ - pagedump.gemspec
103
+ homepage: http://github.com/pompadour/pagedump/
104
+ licenses:
105
+ - MIT
106
+ metadata: {}
107
+ post_install_message:
108
+ rdoc_options: []
109
+ require_paths:
110
+ - lib
111
+ required_ruby_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ requirements: []
122
+ rubyforge_project:
123
+ rubygems_version: 2.5.1
124
+ signing_key:
125
+ specification_version: 4
126
+ summary: An API to scrap pages on the web and to extract selected links from it.
127
+ test_files: []