metainspector 1.13.1 → 1.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +11 -3
 - data/lib/meta_inspector/scraper.rb +4 -1
 - data/lib/meta_inspector/version.rb +1 -1
 - data/meta_inspector.gemspec +1 -1
 - data/spec/metainspector_spec.rb +15 -0
 - metadata +9 -9
 
    
        data/README.md
    CHANGED
    
    | 
         @@ -28,6 +28,10 @@ If you don't include the scheme on the URL, http:// will be used by default: 
     | 
|
| 
       28 
28 
     | 
    
         | 
| 
       29 
29 
     | 
    
         
             
                page = MetaInspector.new('markupvalidator.com')
         
     | 
| 
       30 
30 
     | 
    
         | 
| 
      
 31 
     | 
    
         
            +
            You can also pass in the HTML that will be used as the document to scrape:
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
      
 33 
     | 
    
         
            +
                page = MetaInspector.new("http://markupvalidator.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
       31 
35 
     | 
    
         
             
            ## Accessing scraped data
         
     | 
| 
       32 
36 
     | 
    
         | 
| 
       33 
37 
     | 
    
         
             
            Then you can see the scraped data like this:
         
     | 
| 
         @@ -70,9 +74,13 @@ You can also access most of the scraped data as a hash: 
     | 
|
| 
       70 
74 
     | 
    
         
             
                page.to_hash  # { "url"   => "http://markupvalidator.com",
         
     | 
| 
       71 
75 
     | 
    
         
             
                                  "title" => "MarkupValidator :: site-wide markup validation tool", ... }
         
     | 
| 
       72 
76 
     | 
    
         | 
| 
       73 
     | 
    
         
            -
            The  
     | 
| 
      
 77 
     | 
    
         
            +
            The original document is accessible from:
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     | 
| 
      
 79 
     | 
    
         
            +
                page.document         # A String with the contents of the HTML document
         
     | 
| 
      
 80 
     | 
    
         
            +
             
     | 
| 
      
 81 
     | 
    
         
            +
            And the full scraped document is accessible from:
         
     | 
| 
       74 
82 
     | 
    
         | 
| 
       75 
     | 
    
         
            -
                page. 
     | 
| 
      
 83 
     | 
    
         
            +
                page.parsed_document  # Nokogiri doc that you can use to get any element from the page
         
     | 
| 
       76 
84 
     | 
    
         | 
| 
       77 
85 
     | 
    
         
             
            ## Options
         
     | 
| 
       78 
86 
     | 
    
         | 
| 
         @@ -166,4 +174,4 @@ Thanks to all the contributors: 
     | 
|
| 
       166 
174 
     | 
    
         | 
| 
       167 
175 
     | 
    
         
             
            [https://github.com/jaimeiniesta/metainspector/graphs/contributors](https://github.com/jaimeiniesta/metainspector/graphs/contributors)
         
     | 
| 
       168 
176 
     | 
    
         | 
| 
       169 
     | 
    
         
            -
            Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
         
     | 
| 
      
 177 
     | 
    
         
            +
            Copyright (c) 2009-2012 Jaime Iniesta, released under the MIT license
         
     | 
| 
         @@ -16,8 +16,10 @@ module MetaInspector 
     | 
|
| 
       16 
16 
     | 
    
         
             
                # Options:
         
     | 
| 
       17 
17 
     | 
    
         
             
                # => timeout: defaults to 20 seconds
         
     | 
| 
       18 
18 
     | 
    
         
             
                # => html_content_type_only: if an exception should be raised if request content-type is not text/html. Defaults to false
         
     | 
| 
       19 
     | 
    
         
            -
                # => allow_safe_redirections: 
     | 
| 
      
 19 
     | 
    
         
            +
                # => allow_safe_redirections: if redirects from http to https sites on the same domain should be allowed or not
         
     | 
| 
       20 
20 
     | 
    
         
             
                # => allow_unsafe_redirections: if redirects from https to http sites on the same domain should be allowed or not
         
     | 
| 
      
 21 
     | 
    
         
            +
                # => document: the html of the url as a string
         
     | 
| 
      
 22 
     | 
    
         
            +
                # => verbose: if the errors should be logged to the screen
         
     | 
| 
       21 
23 
     | 
    
         
             
                def initialize(url, options = {})
         
     | 
| 
       22 
24 
     | 
    
         
             
                  options   = defaults.merge(options)
         
     | 
| 
       23 
25 
     | 
    
         | 
| 
         @@ -32,6 +34,7 @@ module MetaInspector 
     | 
|
| 
       32 
34 
     | 
    
         
             
                  @allow_safe_redirections    = options[:allow_safe_redirections]
         
     | 
| 
       33 
35 
     | 
    
         
             
                  @allow_unsafe_redirections  = options[:allow_unsafe_redirections]
         
     | 
| 
       34 
36 
     | 
    
         
             
                  @verbose                    = options[:verbose]
         
     | 
| 
      
 37 
     | 
    
         
            +
                  @document                   = options[:document]
         
     | 
| 
       35 
38 
     | 
    
         
             
                end
         
     | 
| 
       36 
39 
     | 
    
         | 
| 
       37 
40 
     | 
    
         
             
                # Returns the parsed document title, from the content of the <title> tag.
         
     | 
    
        data/meta_inspector.gemspec
    CHANGED
    
    | 
         @@ -21,5 +21,5 @@ Gem::Specification.new do |gem| 
     | 
|
| 
       21 
21 
     | 
    
         
             
              gem.add_development_dependency 'rspec', '2.12.0'
         
     | 
| 
       22 
22 
     | 
    
         
             
              gem.add_development_dependency 'fakeweb', '1.3.0'
         
     | 
| 
       23 
23 
     | 
    
         
             
              gem.add_development_dependency 'awesome_print', '1.1.0'
         
     | 
| 
       24 
     | 
    
         
            -
              gem.add_development_dependency 'rake', '10.0. 
     | 
| 
      
 24 
     | 
    
         
            +
              gem.add_development_dependency 'rake', '~> 10.0.3'
         
     | 
| 
       25 
25 
     | 
    
         
             
            end
         
     | 
    
        data/spec/metainspector_spec.rb
    CHANGED
    
    | 
         @@ -113,6 +113,21 @@ describe MetaInspector do 
     | 
|
| 
       113 
113 
     | 
    
         
             
                end
         
     | 
| 
       114 
114 
     | 
    
         
             
              end
         
     | 
| 
       115 
115 
     | 
    
         | 
| 
      
 116 
     | 
    
         
            +
              describe 'Doing a basic scrape from passed url html' do
         
     | 
| 
      
 117 
     | 
    
         
            +
                
         
     | 
| 
      
 118 
     | 
    
         
            +
                before(:each) do
         
     | 
| 
      
 119 
     | 
    
         
            +
                  @m = MetaInspector.new("http://cnn.com", :document => "<html><head><title>Hello From Passed Html</title><a href='/hello'>Hello link</a></head><body></body></html>")
         
     | 
| 
      
 120 
     | 
    
         
            +
                end
         
     | 
| 
      
 121 
     | 
    
         
            +
             
     | 
| 
      
 122 
     | 
    
         
            +
                it "should get correct links when the url html is passed as an option" do
         
     | 
| 
      
 123 
     | 
    
         
            +
                  @m.links.should == ["http://cnn.com/hello"]
         
     | 
| 
      
 124 
     | 
    
         
            +
                end
         
     | 
| 
      
 125 
     | 
    
         
            +
             
     | 
| 
      
 126 
     | 
    
         
            +
                it "should get the title" do
         
     | 
| 
      
 127 
     | 
    
         
            +
                  @m.title.should == "Hello From Passed Html"
         
     | 
| 
      
 128 
     | 
    
         
            +
                end
         
     | 
| 
      
 129 
     | 
    
         
            +
              end
         
     | 
| 
      
 130 
     | 
    
         
            +
             
     | 
| 
       116 
131 
     | 
    
         
             
              describe 'Page with missing meta description' do
         
     | 
| 
       117 
132 
     | 
    
         
             
                it "should find secondary description" do
         
     | 
| 
       118 
133 
     | 
    
         
             
                  @m = MetaInspector.new('http://theonion-no-description.com')
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,13 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification 
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: metainspector
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version 
         
     | 
| 
       4 
     | 
    
         
            -
              hash:  
     | 
| 
      
 4 
     | 
    
         
            +
              hash: 47
         
     | 
| 
       5 
5 
     | 
    
         
             
              prerelease: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              segments: 
         
     | 
| 
       7 
7 
     | 
    
         
             
              - 1
         
     | 
| 
       8 
     | 
    
         
            -
              -  
     | 
| 
       9 
     | 
    
         
            -
              -  
     | 
| 
       10 
     | 
    
         
            -
              version: 1. 
     | 
| 
      
 8 
     | 
    
         
            +
              - 14
         
     | 
| 
      
 9 
     | 
    
         
            +
              - 0
         
     | 
| 
      
 10 
     | 
    
         
            +
              version: 1.14.0
         
     | 
| 
       11 
11 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       12 
12 
     | 
    
         
             
            authors: 
         
     | 
| 
       13 
13 
     | 
    
         
             
            - Jaime Iniesta
         
     | 
| 
         @@ -15,7 +15,7 @@ autorequire: 
     | 
|
| 
       15 
15 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       16 
16 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       17 
17 
     | 
    
         | 
| 
       18 
     | 
    
         
            -
            date:  
     | 
| 
      
 18 
     | 
    
         
            +
            date: 2013-01-14 00:00:00 Z
         
     | 
| 
       19 
19 
     | 
    
         
             
            dependencies: 
         
     | 
| 
       20 
20 
     | 
    
         
             
            - !ruby/object:Gem::Dependency 
         
     | 
| 
       21 
21 
     | 
    
         
             
              name: nokogiri
         
     | 
| 
         @@ -118,14 +118,14 @@ dependencies: 
     | 
|
| 
       118 
118 
     | 
    
         
             
              requirement: &id007 !ruby/object:Gem::Requirement 
         
     | 
| 
       119 
119 
     | 
    
         
             
                none: false
         
     | 
| 
       120 
120 
     | 
    
         
             
                requirements: 
         
     | 
| 
       121 
     | 
    
         
            -
                - -  
     | 
| 
      
 121 
     | 
    
         
            +
                - - ~>
         
     | 
| 
       122 
122 
     | 
    
         
             
                  - !ruby/object:Gem::Version 
         
     | 
| 
       123 
     | 
    
         
            -
                    hash:  
     | 
| 
      
 123 
     | 
    
         
            +
                    hash: 73
         
     | 
| 
       124 
124 
     | 
    
         
             
                    segments: 
         
     | 
| 
       125 
125 
     | 
    
         
             
                    - 10
         
     | 
| 
       126 
126 
     | 
    
         
             
                    - 0
         
     | 
| 
       127 
     | 
    
         
            -
                    -  
     | 
| 
       128 
     | 
    
         
            -
                    version: 10.0. 
     | 
| 
      
 127 
     | 
    
         
            +
                    - 3
         
     | 
| 
      
 128 
     | 
    
         
            +
                    version: 10.0.3
         
     | 
| 
       129 
129 
     | 
    
         
             
              type: :development
         
     | 
| 
       130 
130 
     | 
    
         
             
              version_requirements: *id007
         
     | 
| 
       131 
131 
     | 
    
         
             
            description: MetaInspector lets you scrape a web page and get its title, charset, link and meta tags
         
     |