url_extractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 12e9a39d947f5860ec829d6f3a0c2a2c8829b0b3
4
+ data.tar.gz: a37afb38367c298689aa194d0945f8c0f82e1318
5
+ SHA512:
6
+ metadata.gz: cc38c7af36334af7f4a23b5f63c0af6bf6f658c070f21a35b370d0c5e5a25942861344c61a74fe85dd02d6cbd78636ff45e9ce488752509ea56d38782243f08f
7
+ data.tar.gz: f8abdf7eae0aa71e35880af91db019970faf3081a3f8363c70a835b9996fb34cef95cfdaef4d0c71852f0c2c0d2e58f3ce3352f90dff8d0abe55aecfe7d6bb0a
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use 2.1.0@url_extractor --create
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in url_extractor.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Michael Guterl
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # UrlExtractor
2
+
3
+ A tool for extracting and replacing URLs from inside a block of text or HTML.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'url_extractor'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install url_extractor
18
+
19
+ ## Usage
20
+
21
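+ Call `UrlExtractor::PlainText.new(ignore_patterns).replace_urls(text) { |url| ... }` (or `UrlExtractor::HTML` for markup); the block receives each http(s) URL that matches none of the ignore patterns, and its return value replaces that URL in the returned string.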
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,6 @@
# Top-level namespace of the url_extractor gem.
#
# Every constant below is registered with +autoload+, so the file that
# defines it is only required the first time the constant is referenced.
module UrlExtractor
  {
    VERSION: 'url_extractor/version',
    Util: 'url_extractor/util',
    PlainText: 'url_extractor/plain_text',
    HTML: 'url_extractor/html'
  }.each do |constant_name, feature_path|
    autoload constant_name, feature_path
  end
end
@@ -0,0 +1,33 @@
require 'nokogiri'

module UrlExtractor
  # Rewrites the +href+ attribute of the anchors found in an HTML string.
  class HTML

    # Raised by #replace_urls when an anchor's +href+ is blank.
    class InvalidURL < StandardError; end

    # @param ignore_patterns [Array<Regexp>] URLs matching any of these
    #   patterns are left untouched by #replace_urls.
    def initialize(ignore_patterns = [])
      @ignore_patterns = ignore_patterns
    end

    # Parses +template+ with Nokogiri and, for every +a+ element whose href
    # is not ignored, yields the href and stores the block's return value as
    # the new href.
    #
    # @param template [String] HTML markup to process
    # @yieldparam url [String] the current href of an anchor
    # @yieldreturn [String] the href to write back
    # @return [String] the serialized Nokogiri document
    # @raise [InvalidURL] if any anchor has a blank href
    def replace_urls(template)
      parsed = Nokogiri::HTML(template)

      parsed.search("a").each do |anchor|
        href = anchor['href']
        # The blank check runs before the ignore check, so a blank href
        # always raises.
        raise InvalidURL, "cannot be blank" if UrlExtractor::Util.blank_string?(href)

        anchor['href'] = yield(href) unless ignored_url?(href)
      end

      parsed.to_s
    end

    private

    # True when +url+ contains no http:// or https:// marker, or when it
    # matches one of the configured ignore patterns.
    def ignored_url?(url)
      return true if url !~ /https?:\/\//

      @ignore_patterns.any? { |pattern| url =~ pattern }
    end

  end
end
@@ -0,0 +1,29 @@
require 'uri'

module UrlExtractor
  # Finds URLs in a plain-text string and substitutes each one with the value
  # returned by a caller-supplied block.
  class PlainText

    # @param ignore_patterns [Array<Regexp>] URLs matching any of these
    #   patterns are left untouched by #replace_urls.
    def initialize(ignore_patterns = [])
      @ignore_patterns = ignore_patterns
    end

    # Replaces every non-ignored URL occurrence in +text+ with the block's
    # return value for that URL. The block is called once per occurrence.
    #
    # @param text [String] the text to scan
    # @yieldparam url [String] a URL found in +text+
    # @yieldreturn [String] the text to put in its place
    # @return [String] +text+ with the replacements applied
    def replace_urls(text)
      urls = URI.extract(text).uniq
      return text if urls.empty?

      # Longest candidates first: when one extracted URL is a prefix of
      # another, the alternation must try the longer one before the shorter
      # one, otherwise the longer URL would be only partially replaced.
      pattern = Regexp.union(urls.sort_by { |url| -url.length })

      # A single pass with the block form of gsub: replacement text is
      # inserted literally (no "\0"/"\1" backreference expansion) and is
      # never re-scanned for further URLs.
      text.gsub(pattern) do |url|
        ignored_url?(url) ? url : yield(url)
      end
    end

    private

    # True when +url+ contains no http:// or https:// marker, or when it
    # matches one of the configured ignore patterns.
    def ignored_url?(url)
      url !~ %r{https?://} || @ignore_patterns.any? { |pattern| url =~ pattern }
    end

  end
end
@@ -0,0 +1,13 @@
module UrlExtractor
  # Stateless helpers. Because the module extends itself, the methods can be
  # called directly on it, e.g. UrlExtractor::Util.blank_string?(value).
  module Util

    extend self

    # Matches a single character that is neither regexp whitespace (\s) nor
    # U+3000 (the code point packed into the character class).
    NON_WHITESPACE_REGEXP = %r![^\s#{[0x3000].pack("U")}]!

    # Reports whether +string+ has no character matching
    # NON_WHITESPACE_REGEXP.
    #
    # @param string [String, nil] the value to test
    # @return [Boolean] true when no such character is found
    def blank_string?(string)
      (string =~ NON_WHITESPACE_REGEXP).nil?
    end

  end
end
@@ -0,0 +1,3 @@
module UrlExtractor
  # Version string of the gem; the gemspec reads it to set the gem version.
  VERSION = '0.0.1'
end
@@ -0,0 +1,41 @@
require 'url_extractor'

describe UrlExtractor::HTML do
  # Wraps +string+ in the DOCTYPE/html/body scaffold that the examples below
  # use for both their input and their expected output.
  def html_wrapped_string(string)
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">\n<html><body>#{string}\n</body></html>\n"
  end

  it 'returns a string with the modified URLs that were returned in the block' do
    extractor = UrlExtractor::HTML.new
    text = html_wrapped_string('foo bar <a href="http://google.com">Google</a>')

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq(html_wrapped_string('foo bar <a href="http://foo.bar">Google</a>'))
  end

  it 'ignores urls that match patterns provided' do
    extractor = UrlExtractor::HTML.new([/google/])
    text = html_wrapped_string('foo bar <a href="http://google.com">Stuff</a> <a href="http://bar.baz">Google</a>')

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq(html_wrapped_string('foo bar <a href="http://google.com">Stuff</a> <a href="http://foo.bar">Google</a>'))
  end

  it "ignores URLs that don't match http or https" do
    extractor = UrlExtractor::HTML.new
    text = html_wrapped_string('foo bar <a href="Reminder:foobar">Hi</a>')

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq(text)
  end

end
@@ -0,0 +1,38 @@
require 'url_extractor'

describe UrlExtractor::PlainText do

  it 'returns a string with the modified URLs that were returned in the block' do
    extractor = UrlExtractor::PlainText.new
    text = 'foo bar http://google.com'

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq('foo bar http://foo.bar')
  end

  it 'ignores urls that match patterns provided' do
    extractor = UrlExtractor::PlainText.new([/google/])
    text = 'http://google.com http://foobar.com'

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq('http://google.com http://foo.bar')
  end

  it "ignores URLs that don't match http or https" do
    extractor = UrlExtractor::PlainText.new
    text = "Reminder: Blah"

    actual = extractor.replace_urls(text) { |_url| 'http://foo.bar' }

    expect(actual).to eq('Reminder: Blah')
  end

end
@@ -0,0 +1,20 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/url_extractor/version', __FILE__)

# Gem specification for url_extractor.
Gem::Specification.new do |gem|
  gem.name          = "url_extractor"
  gem.version       = UrlExtractor::VERSION
  gem.authors       = ["Michael Guterl"]
  gem.email         = ["michael@diminishing.org"]
  gem.description   = %q{A tool for extracting and replacing URLs from inside a block of text or HTML.}
  gem.summary       = %q{A tool for extracting and replacing URLs from inside a block of text or HTML.}
  # TODO: set the project URL; it is currently left empty.
  gem.homepage      = ""
  # Matches the MIT license text shipped in the LICENSE file.
  gem.license       = "MIT"

  # NUL-delimited listing: splitting on whitespace would break any tracked
  # path that contains a space.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rspec'
  gem.add_dependency 'nokogiri'
end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Michael Guterl
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-04-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: A tool for extracting and replacing URLs from inside a block of text
42
+ or HTML.
43
+ email:
44
+ - michael@diminishing.org
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - ".rvmrc"
51
+ - Gemfile
52
+ - LICENSE
53
+ - README.md
54
+ - Rakefile
55
+ - lib/url_extractor.rb
56
+ - lib/url_extractor/html.rb
57
+ - lib/url_extractor/plain_text.rb
58
+ - lib/url_extractor/util.rb
59
+ - lib/url_extractor/version.rb
60
+ - spec/url_extractor/html_spec.rb
61
+ - spec/url_extractor/plain_text_spec.rb
62
+ - url_extractor.gemspec
63
+ homepage: ''
64
+ licenses: []
65
+ metadata: {}
66
+ post_install_message:
67
+ rdoc_options: []
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: '0'
80
+ requirements: []
81
+ rubyforge_project:
82
+ rubygems_version: 2.2.2
83
+ signing_key:
84
+ specification_version: 4
85
+ summary: A tool for extracting and replacing URLs from inside a block of text or HTML.
86
+ test_files:
87
+ - spec/url_extractor/html_spec.rb
88
+ - spec/url_extractor/plain_text_spec.rb