region_extractor 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Andrew Carpenter
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,17 @@
1
+ = region_extractor
2
+
3
+ Attempts to extract geographic regions (runs of UTM coordinate pairs) mentioned in txt/html documents.
4
+
5
+ == Note on Patches/Pull Requests
6
+
7
+ * Fork the project.
8
+ * Make your feature addition or bug fix.
9
+ * Add tests for it. This is important so I don't break it in a
10
+ future version unintentionally.
11
+ * Commit, do not mess with rakefile, version, or history.
12
+ (If you want to have your own version, that is fine, but bump the version in a separate commit that I can ignore when I pull.)
13
+ * Send me a pull request. Bonus points for topic branches.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2010 Andrew Carpenter. See LICENSE for details.
@@ -0,0 +1,46 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Packaging tasks. Jeweler is optional: when it cannot be loaded we print a
# hint and carry on so the spec/rdoc tasks below remain usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "region_extractor"
    gem.summary = %Q{Extract geographic regions from plain text}
    gem.description = %Q{Attempts to extract geographic regions mentioned in txt/html documents.}
    gem.email = "andrew.main@gmail.com"
    gem.homepage = "http://github.com/andrewcarpenter/region_extractor"
    gem.authors = ["Andrew Carpenter"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
    gem.add_dependency('proj4rb', '>= 0.3.1')
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks (RSpec 1.x rake integration).
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, run under rcov for coverage.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# check_dependencies is not defined in this Rakefile; it is expected to come
# from the Jeweler tasks above. Only wire it in when it actually exists, so
# that `rake spec` still works after the LoadError rescue has fired.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

# API documentation.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: the VERSION file ends with a newline that would otherwise leak
  # into the generated title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "region_extractor #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,55 @@
1
+ require 'rubygems'
2
+ require 'proj4'
3
+
4
+ require "region_extractor/region"
5
+ require "region_extractor/point"
6
+
7
# Scans a document for runs of UTM "easting, northing" coordinate pairs and
# builds a Region for each run.
#
# The whole extraction happens in the constructor. Afterwards:
#
# * +regions+ holds one Region per coordinate run that is not an exclusion.
# * A run introduced by "excluding land bound by " is attached as a hole to
#   the most recently found region and is left untouched in the text.
# * +resulting_text+ is the input with every non-exclusion run replaced by
#   the value of the optional block, which is called with
#   (matched_string, region, zero_based_region_index).
#
# NOTE(review): when no block is given, the replacement value is nil, so the
# matched coordinates are removed from +resulting_text+. That is the
# historical behaviour and is preserved here; confirm it is intended.
class RegionExtractor
  # Three or more "digits, digits" pairs (6-7 digits each) separated by
  # semicolons, optionally preceded by the exclusion phrase.
  REGION_PATTERN = /(?:excluding land bound by )?(?:\d{6,7}\s*,\s*\d{6,7};\s*){2,}\d{6,7}\s*,\s*\d{6,7}/

  # Phrase that marks a coordinate run as a hole in the previous region.
  EXCLUSION_PREFIX = /^excluding land bound by /

  # First "UTM Zone N" / "UTM zone N" mention in the document.
  # (The previous character class "[z|Z]" also accepted a literal "|".)
  ZONE_PATTERN = /UTM [zZ]one (\d+)/

  attr_accessor :regions, :resulting_text

  # text               - the document to scan; nil is treated as ''.
  # region_transformer - optional block producing the replacement text for
  #                      each region (see class comment).
  def initialize(text, &region_transformer)
    @text = text || ''
    @regions = []
    @resulting_text = @text
    @region_transformer = region_transformer
    extract!
  end

  private

  # Walks every coordinate run in the text, collecting regions and holes and
  # computing +resulting_text+.
  def extract!
    # The zone comes from the document as a whole, so look it up once rather
    # than once per match.
    zone = detect_zone

    @resulting_text = @text.gsub(REGION_PATTERN) do |region_string|
      region = Region.new(:points => parse_points(region_string), :zone => zone)

      if region_string =~ EXCLUSION_PREFIX
        attach_hole(region)
        region_string # don't modify the document
      else
        register_region(region_string, region)
      end
    end
  end

  # Returns the digits of the first UTM zone mention, or '' when there is none.
  def detect_zone
    match = @text.match(ZONE_PATTERN)
    match ? match[1] : ''
  end

  # Turns "x, y; x, y; ..." (with any exclusion prefix removed) into Points
  # with integer coordinates.
  def parse_points(region_string)
    pairs = region_string.sub(EXCLUSION_PREFIX, '').split(/\s*;\s*/)
    pairs.map do |pair|
      x, y = pair.split(/\s*,\s*/)
      Point.new(x.to_i, y.to_i)
    end
  end

  # Adds +hole+ to the most recently found region. An exclusion that appears
  # before any region has nothing to attach to and is ignored instead of
  # raising NoMethodError on nil.
  def attach_hole(hole)
    parent = @regions.last
    parent.add_hole(hole) if parent
  end

  # Records +region+ and returns the replacement for its text: the
  # transformer's result, or nil when no transformer was supplied.
  def register_region(region_string, region)
    @regions << region
    return nil unless @region_transformer

    @region_transformer.call(region_string, region, @regions.size - 1)
  end
end
48
+
49
+ # extractor = RegionExtractor.new("UTM Zone 10 507093, 4879404; 507095, 4879401; 507097, 4879399") do |region_string, region, i|
50
+ # "<div id=\"map_#{i}\"></div>"
51
+ #
52
+ # end
53
+ #
54
+ # extractor.regions
55
+ # extractor.resulting_text
@@ -0,0 +1,16 @@
1
+ class RegionExtractor
2
+ include Proj4
3
# A plain x/y coordinate pair.
class Point
  attr_accessor :x, :y

  def initialize(x, y)
    @x = x
    @y = y
  end

  # Reprojects +point+ from +origin_projection+ into +destination_projection+
  # and returns the result as a new Point.
  #
  # point                  - object handed to origin_projection.transform.
  # origin_projection      - projection the point is currently expressed in;
  #                          must respond to transform(destination, point).
  # destination_projection - projection to convert into.
  #
  # Both output coordinates are multiplied by Proj4::RAD_TO_DEG.
  # NOTE(review): that presumes the destination projection yields radians
  # (as a lat/long projection would) -- confirm before using it with any
  # other kind of destination.
  def self.transform_point(point, origin_projection, destination_projection)
    output = origin_projection.transform(destination_projection, point)
    Point.new(output.x * Proj4::RAD_TO_DEG, output.y * Proj4::RAD_TO_DEG)
  end

  class << self
    # The method was originally published under this misspelled name; keep it
    # working for existing callers.
    alias_method :tranform_point, :transform_point
  end
end
16
+ end
@@ -0,0 +1,23 @@
1
+ class RegionExtractor
2
# A polygon described by UTM points in a given zone, plus any regions that
# have been registered as holes in it.
class Region
  attr_accessor :points, :zone, :holes

  # options - Hash with:
  #   :points - Array of points making up the boundary; defaults to [] so a
  #             region built without points can still be queried.
  #   :zone   - UTM zone, passed straight to the source projection.
  def initialize(options = {})
    @points = options[:points] || []
    @zone = options[:zone]
    # Source projection for the stored points: UTM / NAD83 in the given zone.
    @projection = Proj4::Projection.new(:proj => 'utm', :datum => "NAD83", :zone => @zone)
    @holes = []
  end

  # Returns the boundary points reprojected into +output_projection+
  # (lat/long WGS84 unless another projection is supplied).
  def transformed_points(output_projection = Proj4::Projection.new(:proj => 'latlong', :datum => 'WGS84'))
    @points.map { |point| Point.tranform_point(point, @projection, output_projection) }
  end

  # Returns the reprojected points as newline-separated "x,y,0.000000" lines.
  def coordinates(output_projection = Proj4::Projection.new(:proj => 'latlong', :datum => 'WGS84'))
    lines = transformed_points(output_projection).map { |point| "#{point.x},#{point.y},0.000000" }
    lines.join("\n")
  end

  # Registers +region+ as a hole inside this region.
  def add_hole(region)
    @holes << region
  end
end
23
+ end
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for region_extractor 0.0.1: package metadata, the list of
# shipped files, and the rspec (development) / proj4rb (runtime) dependencies.
Gem::Specification.new do |s|
  s.name = %q{region_extractor}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Andrew Carpenter"]
  s.date = %q{2010-06-12}
  s.description = %q{Attempts to extract geographic regions mentioned in txt/html documents.}
  s.email = %q{andrew.main@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/region_extractor.rb",
    "lib/region_extractor/point.rb",
    "lib/region_extractor/region.rb",
    "region_extractor.gemspec",
    "spec/region_extractor_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/andrewcarpenter/region_extractor}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{Extract geographic regions from plain text}
  s.test_files = [
    "spec/region_extractor_spec.rb",
    "spec/spec_helper.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Gem::VERSION replaces Gem::RubyGemsVersion, which current RubyGems no
    # longer defines (referencing it raises NameError while loading the spec).
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
      s.add_runtime_dependency(%q<proj4rb>, [">= 0.3.1"])
    else
      # RubyGems older than 1.2.0 cannot distinguish dependency types.
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
      s.add_dependency(%q<proj4rb>, [">= 0.3.1"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
    s.add_dependency(%q<proj4rb>, [">= 0.3.1"])
  end
end
60
+
@@ -0,0 +1,94 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
# Behavioural specs for RegionExtractor: which coordinate runs are recognised
# as regions, which zone they are given, how exclusions are handled, and what
# the optional block does to the resulting text.
describe "RegionExtractor" do
  context "parsing text" do
    # Inputs containing numbers and punctuation but no coordinate runs.
    context "with no regions present" do
      before do
        @samples = [
          "",
          "The brown cow.",
          "10/15/1982",
          "Portland, OR 97266; telephone 503-231-6179; facsimile 503-231-6195",
          "<p>Optimal Oregon chub habitat provides 1 square meter (11 square feet) of aquatic surface area per adult, at depths between 0.5 m (1.6 ft) to 2 m (6.6 ft) (Scheerer 2008b).</p>"
        ]
      end

      it "returns no regions" do
        @samples.each { |sample| RegionExtractor.new(sample).regions.should == [] }
      end

      it "does not modify the text" do
        @samples.each { |sample| RegionExtractor.new(sample).resulting_text.should == sample }
      end
    end

    # A single run of three coordinate pairs with an explicit zone.
    context "with one simple region" do
      before do
        @extractor = RegionExtractor.new("Land bounded by the following UTM Zone 10, NAD83 coordinates (E,N): 557923, 4838857; 557919, 4838854; 557919, 4838854;")
      end

      it "finds the correct number of regions" do
        @extractor.regions.size.should == 1
      end

      it "the region has the correct number of points" do
        only_region = @extractor.regions.first
        only_region.points.size.should == 3
      end

      it "should determine the correct zone" do
        only_region = @extractor.regions.first
        only_region.zone.should == "10"
      end
    end

    # Two separate runs embedded in HTML markup.
    context "with more complex regions" do
      before do
        @extractor = RegionExtractor.new("<p>Unit 1A consists of boundary points with the following coordinates in UTM Zone 4, with the units in meters, using North American Datum of 1983 (Nad83):</p>
<p>(A) 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109; 451596, 2421040; 451959, 2421072.</p><p>(ii) Follow the approximate coordinates: 457583, 2422071; 457631, 2422040; 457702, 2421952; 457543, 2421778; 457490, 2421812; 457400, 2421778; 457352, 2421693; 457380, 2421601.</p>")
      end

      it "finds the correct number of regions" do
        @extractor.regions.size.should == 2
      end

      it "the regions have the correct number of points" do
        found = @extractor.regions
        found[0].points.size.should == 6
        found[1].points.size.should == 8
      end

      it "should determine the correct zone" do
        @extractor.regions[0].zone.should == "4"
      end
    end

    # An "excluding land bound by" run must become a hole, not a region, and
    # must be left as-is in the text.
    context "with regions to exclude" do
      before do
        @extractor = RegionExtractor.new("<p>UTM zone 4 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109 excluding land bound by 457490, 2421812; 457400, 2421778; 457352, 2421693") { |str, _region, _i| "#{str}[POST]" }
      end

      it "should find the right number of regions" do
        @extractor.regions.size.should == 1
      end

      it "should not overwrite the exclude" do
        @extractor.resulting_text.should == "<p>UTM zone 4 451377, 2420941; 451318, 2421296; 451365, 2421383; 451432, 2421109[POST] excluding land bound by 457490, 2421812; 457400, 2421778; 457352, 2421693"
      end
    end

    # The block receives the matched text, the region and its index.
    context "with a block" do
      it "puts a replacement in for each region" do
        source_text = "UTM Zone 10 557923, 4838857; 557919, 4838854; 557919, 4838854. Then 457631, 2421540; 457678, 2421675; 457766, 2421821; 457637, 2421453."
        extractor = RegionExtractor.new(source_text) do |str, region, i|
          "#{str}[i=#{i}; points=#{region.points.size}]"
        end

        extractor.resulting_text.should == "UTM Zone 10 557923, 4838857; 557919, 4838854; 557919, 4838854[i=0; points=3]. Then 457631, 2421540; 457678, 2421675; 457766, 2421821; 457637, 2421453[i=1; points=4]."
      end
    end
  end
end
93
+
94
+ # (ii) Starting on the coastline at approximately coordinates of: 458997, 2422152; follow: 458345, 2422341; 458686, 2422405; 458786, 2422373;458934, 2422253; 459001, 2422151; 458997, 2422152; 457589, 2420990; 457575, 2420975; 457511, 2420984; 457631, 2421127; 457738, 2421168; 457900, 2421206; 458023, 2421343; 458023, 2421417; 457895, 2421435; 457803, 2421394; 457686, 2421405; 457637, 2421453; 457631, 2421540; 457678, 2421675; 457766, 2421821; 457908, 2421944; 458069, 2421867; 458216, 2421849; 458244, 2421886; 458253, 2421996; 458235, 2422079; 458299, 2422272; 458345, 2422341; 457589, 2420990; to approximately: 457590, 2420991 (coastline); follow coastline to the approximate coordinates of: 458494, 2421794; then follow: 458494, 2421795; 458495, 2421795; 458502, 2421802, 458492, 2421904; 458483, 2421987; 458566, 2422060; 458559, 2422190; 458630, 2422263; 458718, 2422262; 458805, 2422159; 458777, 2422115; 458686, 2422119; 458658, 2422060; 458667, 2421987; 458702, 2421920; to the coastline, approximately at: 458702, 2421919; follow coastline to beginning point: 458997, 2422152.
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'region_extractor'
4
+ require 'spec'
5
+ require 'spec/autorun'
6
+
7
+ Spec::Runner.configure do |config|
8
+
9
+ end
metadata ADDED
@@ -0,0 +1,103 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: region_extractor
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Andrew Carpenter
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-06-12 00:00:00 -07:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: rspec
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 2
30
+ - 9
31
+ version: 1.2.9
32
+ type: :development
33
+ version_requirements: *id001
34
+ - !ruby/object:Gem::Dependency
35
+ name: proj4rb
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ - 3
44
+ - 1
45
+ version: 0.3.1
46
+ type: :runtime
47
+ version_requirements: *id002
48
+ description: Attempts to extract geographic regions mentioned in txt/html documents.
49
+ email: andrew.main@gmail.com
50
+ executables: []
51
+
52
+ extensions: []
53
+
54
+ extra_rdoc_files:
55
+ - LICENSE
56
+ - README.rdoc
57
+ files:
58
+ - .document
59
+ - .gitignore
60
+ - LICENSE
61
+ - README.rdoc
62
+ - Rakefile
63
+ - VERSION
64
+ - lib/region_extractor.rb
65
+ - lib/region_extractor/point.rb
66
+ - lib/region_extractor/region.rb
67
+ - region_extractor.gemspec
68
+ - spec/region_extractor_spec.rb
69
+ - spec/spec.opts
70
+ - spec/spec_helper.rb
71
+ has_rdoc: true
72
+ homepage: http://github.com/andrewcarpenter/region_extractor
73
+ licenses: []
74
+
75
+ post_install_message:
76
+ rdoc_options:
77
+ - --charset=UTF-8
78
+ require_paths:
79
+ - lib
80
+ required_ruby_version: !ruby/object:Gem::Requirement
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ segments:
85
+ - 0
86
+ version: "0"
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ segments:
92
+ - 0
93
+ version: "0"
94
+ requirements: []
95
+
96
+ rubyforge_project:
97
+ rubygems_version: 1.3.6
98
+ signing_key:
99
+ specification_version: 3
100
+ summary: Extract geographic regions from plain text
101
+ test_files:
102
+ - spec/region_extractor_spec.rb
103
+ - spec/spec_helper.rb