region_extractor 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Andrew Carpenter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = region_extractor
2
+
3
+ Attempts to extract geographic regions (runs of UTM coordinate pairs) mentioned in txt/html documents.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit; do not mess with the Rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.
@@ -0,0 +1,46 @@
1
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it (or one of its
# dependencies) cannot be loaded, only a hint is printed and the remaining
# tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "region_extractor"
    gem.summary = %Q{Extract geographic regions from plain text}
    gem.description = %Q{Attempts to extract geographic regions mentioned in txt/html documents.}
    gem.email = "andrew.main@gmail.com"
    gem.homepage = "http://github.com/andrewcarpenter/region_extractor"
    gem.authors = ["Andrew Carpenter"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    gem.add_dependency('proj4rb', '>= 0.3.1')
  end
  Jeweler::GemcutterTasks.new

  # check_dependencies is not defined anywhere in this Rakefile; it is
  # expected to be provided by Jeweler::Tasks. Declaring the prerequisite
  # here keeps `rake spec` usable when jeweler failed to load.
  task :spec => :check_dependencies
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks: `spec` runs every *_spec.rb file under spec/, `rcov` runs the
# same files with rcov enabled.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

# RDoc generation into the rdoc/ directory.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip surrounding whitespace so a trailing newline in VERSION does not
  # end up in the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "region_extractor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,55 @@
1
+ require 'rubygems'
2
+ require 'proj4'
3
+
4
+ require "region_extractor/region"
5
+ require "region_extractor/point"
6
+
7
# Scans a document for runs of coordinate pairs written as
# "easting, northing; easting, northing; ..." and turns each run into a
# Region.
#
# A run needs at least three pairs, and every number must be 6-7 digits long.
# A run prefixed with "excluding land bound by " is not recorded as a region
# of its own; it is attached as a hole to the most recently found region and
# its text is left untouched.
#
# The zone handed to every region is the first "UTM Zone N" / "UTM zone N"
# mentioned anywhere in the text, or '' when there is none.
#
# If a block is given, it is called as block.call(matched_text, region, index)
# for every recorded region and its return value replaces the matched text in
# #resulting_text. Without a block the matched text is replaced by nothing.
class RegionExtractor
  # Optional exclusion prefix followed by three or more "E, N" pairs
  # separated by semicolons.
  REGION_PATTERN = /(?:excluding land bound by )?(?:\d{6,7}\s*,\s*\d{6,7};\s*){2,}\d{6,7}\s*,\s*\d{6,7}/
  # Marks a run as a hole in the preceding region.
  EXCLUSION_PREFIX = /\Aexcluding land bound by /
  # Captures the zone number; accepts "Zone" and "zone".
  ZONE_PATTERN = /UTM [zZ]one (\d+)/

  attr_accessor :regions, :resulting_text

  # text               - the document to scan; nil is treated as ''.
  # region_transformer - optional block producing the replacement text for
  #                      each recorded region.
  def initialize(text, &region_transformer)
    @text = text || ''
    @regions = []
    @resulting_text = @text
    @region_transformer = region_transformer
    extract!
  end

  private

  # Populates @regions and @resulting_text from @text.
  def extract!
    # The zone is a property of the whole document, so look it up once.
    zone = document_zone

    @resulting_text = @text.gsub(REGION_PATTERN) do |region_string|
      region = Region.new(:points => parse_points(region_string), :zone => zone)

      if region_string =~ EXCLUSION_PREFIX
        # An exclusion with nothing before it has no region to cut a hole
        # into; skip it instead of calling add_hole on nil.
        enclosing_region = @regions.last
        enclosing_region.add_hole(region) if enclosing_region

        region_string # don't modify the document
      else
        @regions << region

        if @region_transformer
          @region_transformer.call(region_string, region, @regions.size - 1)
        else
          nil # gsub turns nil into '', removing the matched text
        end
      end
    end
  end

  # Returns the first UTM zone number mentioned in the text as a String, or
  # '' when no zone is mentioned.
  def document_zone
    match = @text.match(ZONE_PATTERN)
    match ? match[1] : ''
  end

  # Converts one matched run of "E, N; E, N; ..." into an Array of Points
  # with Integer coordinates, ignoring a leading exclusion prefix.
  def parse_points(region_string)
    region_string.sub(EXCLUSION_PREFIX, '').split(/\s*;\s*/).map do |pair|
      x, y = pair.split(/\s*,\s*/)
      Point.new(x.to_i, y.to_i)
    end
  end
end
48
+
49
+ # extractor = RegionExtractor.new("UTM Zone 10 507093, 4879404; 507095, 4879401; 507099, 4879398") do |region_string, region, i|
50
+ # "<div id=\"map_#{i}\"></div>"
51
+ #
52
+ # end
53
+ #
54
+ # extractor.regions
55
+ # extractor.resulting_text
@@ -0,0 +1,16 @@
1
+ class RegionExtractor
2
+ include Proj4
3
# A plain x/y coordinate pair.
class Point
  attr_accessor :x, :y

  def initialize(x, y)
    @x = x
    @y = y
  end

  # Reprojects +point+ and returns the result as a new point.
  #
  # point                  - passed as-is to origin_projection.transform.
  # origin_projection      - object responding to transform(destination, point).
  # destination_projection - passed as-is to origin_projection.transform.
  #
  # Both coordinates of the transform result are multiplied by
  # Proj4::RAD_TO_DEG.
  # NOTE(review): this presumes the transform output is in radians, as for a
  # lat/long destination - confirm before using other destinations.
  def self.transform_point(point, origin_projection, destination_projection)
    output = origin_projection.transform(destination_projection, point)
    new(output.x * Proj4::RAD_TO_DEG, output.y * Proj4::RAD_TO_DEG)
  end

  # Original, misspelled name of transform_point; kept so existing callers
  # continue to work.
  def self.tranform_point(point, origin_projection, destination_projection)
    transform_point(point, origin_projection, destination_projection)
  end
end
16
+ end
@@ -0,0 +1,23 @@
1
+ class RegionExtractor
2
# A polygon described by points in a UTM zone, optionally with holes
# (other regions) cut out of it.
class Region
  attr_accessor :points, :holes
  attr_reader :zone

  # options - Hash with:
  #           :points - Array of points (defaults to an empty Array).
  #           :zone   - UTM zone used when building the source projection.
  def initialize(options = {})
    @points = options[:points] || []
    @zone = options[:zone]
    @holes = []
  end

  # Changes the zone and drops the cached source projection so the next
  # transformation uses the new zone rather than the one seen first.
  def zone=(new_zone)
    @projection = nil
    @zone = new_zone
  end

  # Returns the region's points run through Point.tranform_point from the
  # region's UTM/NAD83 projection into +output_projection+ (latlong/WGS84
  # unless given).
  def transformed_points(output_projection = default_output_projection)
    @points.map { |p| Point.tranform_point(p, projection, output_projection) }
  end

  # Returns the transformed points as one "x,y,0.000000" line per point,
  # joined with newlines.
  def coordinates(output_projection = default_output_projection)
    transformed_points(output_projection).map { |p| "#{p.x},#{p.y},0.000000" }.join("\n")
  end

  # Records +region+ as a hole in this region.
  def add_hole(region)
    @holes << region
  end

  private

  # Source projection for the current zone. Built on first use, so any error
  # from Proj4 for an unusable zone surfaces when points are transformed
  # rather than when the region is constructed.
  def projection
    @projection ||= Proj4::Projection.new( :proj => 'utm', :datum => "NAD83", :zone => @zone)
  end

  # Projection used when the caller does not supply one.
  def default_output_projection
    Proj4::Projection.new( :proj => 'latlong', :datum => 'WGS84' )
  end
end
23
+ end
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{region_extractor}
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Andrew Carpenter"]
12
+ s.date = %q{2010-06-12}
13
+ s.description = %q{Attempts to extract geographic regions mentioned in txt/html documents.}
14
+ s.email = %q{andrew.main@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/region_extractor.rb",
27
+ "lib/region_extractor/point.rb",
28
+ "lib/region_extractor/region.rb",
29
+ "region_extractor.gemspec",
30
+ "spec/region_extractor_spec.rb",
31
+ "spec/spec.opts",
32
+ "spec/spec_helper.rb"
33
+ ]
34
+ s.homepage = %q{http://github.com/andrewcarpenter/region_extractor}
35
+ s.rdoc_options = ["--charset=UTF-8"]
36
+ s.require_paths = ["lib"]
37
+ s.rubygems_version = %q{1.3.6}
38
+ s.summary = %q{Extract geographic regions from plain text}
39
+ s.test_files = [
40
+ "spec/region_extractor_spec.rb",
41
+ "spec/spec_helper.rb"
42
+ ]
43
+
44
+ if s.respond_to? :specification_version then
45
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
46
+ s.specification_version = 3
47
+
48
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
49
+ s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
50
+ s.add_runtime_dependency(%q<proj4rb>, [">= 0.3.1"])
51
+ else
52
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
53
+ s.add_dependency(%q<proj4rb>, [">= 0.3.1"])
54
+ end
55
+ else
56
+ s.add_dependency(%q<rspec>, [">= 1.2.9"])
57
+ s.add_dependency(%q<proj4rb>, [">= 0.3.1"])
58
+ end
59
+ end
60
+
@@ -0,0 +1,94 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ describe "RegionExtractor" do
3
+ context "parsing text" do
4
+ context "with no regions present" do
5
+ before(:each) do
6
+ @texts = [
7
+ "",
8
+ "The brown cow.",
9
+ "10/15/1982",
10
+ "Portland, OR 97266; telephone 503-231-6179; facsimile 503-231-6195",
11
+ "<p>Optimal Oregon chub habitat provides 1 square meter (11 square feet) of aquatic surface area per adult, at depths between 0.5 m (1.6 ft) to 2 m (6.6 ft) (Scheerer 2008b).</p>"
12
+ ]
13
+ end
14
+
15
+ it "returns no regions" do
16
+ @texts.each do |text|
17
+ RegionExtractor.new(text).regions.should == []
18
+ end
19
+ end
20
+
21
+ it "does not modify the text" do
22
+ @texts.each do |text|
23
+ RegionExtractor.new(text).resulting_text.should == text
24
+ end
25
+ end
26
+ end
27
+
28
+ context "with one simple region" do
29
+ before(:each) do
30
+ @extractor = RegionExtractor.new("Land bounded by the following UTM Zone 10, NAD83 coordinates (E,N): 557923, 4838857; 557919, 4838854; 557919, 4838854;")
31
+ end
32
+
33
+ it "finds the correct number of regions" do
34
+ @extractor.regions.size.should == 1
35
+ end
36
+
37
+ it "the region has the correct number of points" do
38
+ @extractor.regions.first.points.size.should == 3
39
+ end
40
+
41
+ it "should determine the correct zone" do
42
+ @extractor.regions.first.zone.should == "10"
43
+ end
44
+
45
+ end
46
+
47
+ context "with more complex regions" do
48
+ before(:each) do
49
+ @extractor = RegionExtractor.new("<p>Unit 1A consists of boundary points with the following coordinates in UTM Zone 4, with the units in meters, using North American Datum of 1983 (Nad83):</p>
50
+ <p>(A) 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109; 451596, 2421040; 451959, 2421072.</p><p>(ii) Follow the approximate coordinates: 457583, 2422071; 457631, 2422040; 457702, 2421952; 457543, 2421778; 457490, 2421812; 457400, 2421778; 457352, 2421693; 457380, 2421601.</p>")
51
+ end
52
+
53
+ it "finds the correct number of regions" do
54
+ @extractor.regions.size.should == 2
55
+ end
56
+
57
+ it "the regions have the correct number of points" do
58
+ @extractor.regions[0].points.size.should == 6
59
+ @extractor.regions[1].points.size.should == 8
60
+ end
61
+
62
+ it "should determine the correct zone" do
63
+ @extractor.regions[0].zone.should == "4"
64
+ end
65
+ end
66
+
67
+ context "with regions to exclude" do
68
+ before(:each) do
69
+ @extractor = RegionExtractor.new("<p>UTM zone 4 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109 excluding land bound by 457490, 2421812; 457400, 2421778; 457352, 2421693") do |str, region, i|
70
+ "#{str}[POST]"
71
+ end
72
+ end
73
+
74
+ it "should find the right number of regions" do
75
+ @extractor.regions.size.should == 1
76
+ end
77
+
78
+ it "should not overwrite the exclude" do
79
+ @extractor.resulting_text.should == "<p>UTM zone 4 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109[POST] excluding land bound by 457490, 2421812; 457400, 2421778; 457352, 2421693"
80
+ end
81
+ end
82
+
83
+ context "with a block" do
84
+ it "puts a replacement in for each region" do
85
+ extractor = RegionExtractor.new("UTM Zone 10 557923, 4838857; 557919, 4838854; 557919, 4838854. Then 457631, 2421540; 457678, 2421675; 457766, 2421821; 457637, 2421453.") do |str, region, i|
86
+ "#{str}[i=#{i}; points=#{region.points.size}]"
87
+ end
88
+ extractor.resulting_text.should == "UTM Zone 10 557923, 4838857; 557919, 4838854; 557919, 4838854[i=0; points=3]. Then 457631, 2421540; 457678, 2421675; 457766, 2421821; 457637, 2421453[i=1; points=4]."
89
+ end
90
+ end
91
+ end
92
+ end
93
+
94
+ # (ii) Starting on the coastline at approximately coordinates of: 458997, 2422152; follow: 458345, 2422341; 458686, 2422405; 458786, 2422373;458934, 2422253; 459001, 2422151; 458997, 2422152; 457589, 2420990; 457575, 2420975; 457511, 2420984; 457631, 2421127; 457738, 2421168; 457900, 2421206; 458023, 2421343; 458023, 2421417; 457895, 2421435; 457803, 2421394; 457686, 2421405; 457637, 2421453; 457631, 2421540; 457678, 2421675; 457766, 2421821; 457908, 2421944; 458069, 2421867; 458216, 2421849; 458244, 2421886; 458253, 2421996; 458235, 2422079; 458299, 2422272; 458345, 2422341; 457589, 2420990; to approximately: 457590, 2420991 (coastline); follow coastline to the approximate coordinates of: 458494, 2421794; then follow: 458494, 2421795; 458495, 2421795; 458502, 2421802, 458492, 2421904; 458483, 2421987; 458566, 2422060; 458559, 2422190; 458630, 2422263; 458718, 2422262; 458805, 2422159; 458777, 2422115; 458686, 2422119; 458658, 2422060; 458667, 2421987; 458702, 2421920; to the coastline, approximately at: 458702, 2421919; follow coastline to beginning point: 458997, 2422152.
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'region_extractor'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ Spec::Runner.configure do |config|
8
+
9
+ end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: region_extractor
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Andrew Carpenter
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-06-12 00:00:00 -07:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: proj4rb
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ - 3
44
+ - 1
45
+ version: 0.3.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ description: Attempts to extract geographic regions mentioned in txt/html documents.
49
+ email: andrew.main@gmail.com
50
+ executables: []
51
+
52
+ extensions: []
53
+
54
+ extra_rdoc_files:
55
+ - LICENSE
56
+ - README.rdoc
57
+ files:
58
+ - .document
59
+ - .gitignore
60
+ - LICENSE
61
+ - README.rdoc
62
+ - Rakefile
63
+ - VERSION
64
+ - lib/region_extractor.rb
65
+ - lib/region_extractor/point.rb
66
+ - lib/region_extractor/region.rb
67
+ - region_extractor.gemspec
68
+ - spec/region_extractor_spec.rb
69
+ - spec/spec.opts
70
+ - spec/spec_helper.rb
71
+ has_rdoc: true
72
+ homepage: http://github.com/andrewcarpenter/region_extractor
73
+ licenses: []
74
+
75
+ post_install_message:
76
+ rdoc_options:
77
+ - --charset=UTF-8
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ segments:
85
+ - 0
86
+ version: "0"
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ segments:
92
+ - 0
93
+ version: "0"
94
+ requirements: []
95
+
96
+ rubyforge_project:
97
+ rubygems_version: 1.3.6
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Extract geographic regions from plain text
101
+ test_files:
102
+ - spec/region_extractor_spec.rb
103
+ - spec/spec_helper.rb