RubyGems - engtagger - Versions diffs - 0.1.0 - Mend

engtagger 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

data/History.txt +4 -0
data/LICENSE.txt +340 -0
data/Manifest.txt +13 -0
data/README.txt +70 -0
data/Rakefile +24 -0
data/lib/engtagger.rb +729 -0
data/lib/engtagger/porter.rb +196 -0
data/lib/engtagger/pos_tags.hash +0 -0
data/lib/engtagger/pos_words.hash +4467 -0
data/lib/engtagger/tags.yml +45 -0
data/lib/engtagger/unknown.yml +12 -0
data/lib/engtagger/words.yml +43818 -0
data/test/test_engtagger.rb +196 -0
metadata +86 -0

data/test/test_engtagger.rb ADDED Viewed

@@ -0,0 +1,196 @@
+# Code Generated by ZenTest v. 3.9.2
+#                 classname: asrt / meth =  ratio%
+#                    EngTagger:    0 /   24 =   0.00%
+$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
+$LOAD_PATH << $ENGTAGGER_LIB
+require 'test/unit' unless defined? $ZENTEST and $ZENTEST
+require 'engtagger'
+class TestEngTagger < Test::Unit::TestCase
+  @@untagged =<<EOD
+Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
+EOD
+  @@tagged =<<EOD
+<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
+EOD
+  def setup
+    @tagger = EngTagger.new
+    tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
+    wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
+    if !File.exists?(tagpath) or !File.exists?(wordpath)
+      @tagger.install
+    end
+  end
+  def text_get_ext
+    model = '<cd>[^<]+</cd}>\s*'
+    assert_equal(model, EngTagger.get_ext(model, "cd"))
+  end
+  def test_explain_tag
+    assert_equal("noun", EngTagger.explain_tag("nn"))
+    assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
+  end
+  def test_add_tags
+    assert_instance_of(String, @tagger.add_tags(@@untagged))
+  end
+  def test_assign_tag
+    models = []; tests = []
+    models += [@tagger.conf[:unknown_word_tag], "sym"]
+    tests += [["pp","-unknown-"], ["pp", "-sym-"]]
+    models.length.times do |i|
+      assert_equal(models[i],@tagger.assign_tag(*tests[i]))
+    end
+    tests = []
+    tests += [["vb","water"], ["nn", "runs"]]
+    models.length.times do |i|
+      result = @tagger.assign_tag(*tests[i])
+      assert(EngTagger.hmm.keys.index(result))
+    end
+  end
+  def test_classify_unknown_word
+    assert_equal("*LRB*", @tagger.classify_unknown_word("{"))
+    assert_equal("*NUM*", @tagger.classify_unknown_word("123.4567"))
+    assert_equal("*ORD*", @tagger.classify_unknown_word("40th"))
+    assert_equal("-abr-", @tagger.classify_unknown_word("GT-R"))
+    assert_equal("-hyp-adj-", @tagger.classify_unknown_word("extremely-high"))
+    assert_equal("-sym-", @tagger.classify_unknown_word("&&"))
+    assert_equal("-ing-", @tagger.classify_unknown_word("wikiing"))
+    assert_equal("-unknown-", @tagger.classify_unknown_word("asefasdf"))
+  end
+  def test_clean_text
+    test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
+    model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
+    assert_equal(model, @tagger.clean_text(test))
+  end
+  def test_clean_word
+    models = []; tests = []
+    models += ["*NUM*"]
+    models += ["Plays"]
+    models += ["pleadingly"]
+    tests += ["1973.0820", "Plays", "Pleadingly"]
+    models.length.times do |i|
+      assert_equal(models[i], @tagger.clean_word(tests[i]))
+    end
+  end
+  def test_get_max_noun_phrases
+    result = @tagger.get_max_noun_phrases(@@tagged)
+    assert_instance_of(Hash, result)
+  end
+  def test_get_max_noun_regex
+    assert_instance_of(Regexp, @tagger.get_max_noun_regex)
+  end
+  def test_get_noun_phrases
+    result = @tagger.get_noun_phrases(@@tagged)
+    assert_instance_of(Hash, result)
+  end
+  def test_get_nouns
+    result = @tagger.get_nouns(@@tagged)
+    assert_instance_of(Hash, result)
+  end
+  def test_get_proper_nouns
+    test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
+    result = @tagger.get_proper_nouns(test)
+    assert_instance_of(Hash, result)
+  end
+  def test_get_readable
+    test = "I woke up to the sound of pouring rain."
+    result = @tagger.get_readable(test)
+    assert(String, result)
+  end
+  def test_get_sentences
+    result = @tagger.get_sentences(@@untagged)
+    assert_equal(4, result.length)
+  end
+  def test_get_words
+    @tagger.conf[:longest_noun_phrase] = 1
+    result1 = @tagger.get_words(@@tagged)
+    @tagger.conf[:longest_noun_phrase] = 10
+    result2 = @tagger.get_words(@@tagged)
+    assert_instance_of(Hash, result1)
+    assert_instance_of(Hash, result2)
+  end
+  def test_reset
+    @tagger.conf[:current_tag] = 'nn'
+    @tagger.reset
+    assert_equal('pp', @tagger.conf[:current_tag])
+  end
+  def test_split_punct
+    models = []; texts = []
+    models << ["`", "test"]; texts <<  "`test"
+    models << ["``", "test"]; texts <<  "\"test"
+    models << ["`", "test"]; texts <<  "'test"
+    models << ["''"]; texts <<  '"'
+    models << ["test", "'"]; texts <<  "test' "
+    models << ["-", "test", "-"]; texts << "---test-----"
+    models << ["test", ",", "test"]; texts <<  "test,test"
+    models << ["123,456"]; texts <<  "123,456"
+    models << ["test", ":"]; texts <<  "test:"
+    models << ["test1", "...", "test2"]; texts <<  "test1...test2"
+    models << ["{", "ab","[","(","c",")","[","d","]","]","}"]; texts <<  "{ab[(c)[d]]}"
+    models << ["test", "#", "test"]; texts <<  "test#test"
+    models << ["I", "'d", "like"]; texts <<  "I'd like"
+    models << ["is", "n't", "so"]; texts <<  "isn't so"
+    models << ["we", "'re", "all"]; texts <<  "we're all"
+    texts.each_with_index do |text, index|
+      assert_equal(models[index], @tagger.split_punct(text))
+    end
+  end
+  def test_split_sentences
+    models = []; tests = []
+    models << ["He", "is", "a", "u.s.", "army", "officer", "."]
+    tests << ["He", "is", "a", "u.s.", "army", "officer."]
+    models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
+    tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
+    models.length.times do |i|
+      assert_equal(models[i], @tagger.split_sentences(tests[i]))
+    end
+  end
+  def test_stem
+    word = "gets"
+    old = @tagger.conf[:stem]
+    @tagger.conf[:stem] = true
+    assert_equal("get", @tagger.stem(word))
+    # the following should not work since we memoize stem method
+    # @tagger.conf[:stem] = false
+    # assert_equal("gets", @tagger.stem(word))
+    @tagger.conf[:stem] = old
+  end
+  def test_strip_tags
+    assert_instance_of(String, @tagger.strip_tags(@@tagged))
+  end
+  def test_valid_text
+    text = nil
+    assert(!@tagger.valid_text(text))
+    text = "this is test text"
+    assert(@tagger.valid_text(text))
+    text = ""
+    assert(!@tagger.valid_text(text))
+  end
+end
+# Number of errors detected: 24

metadata ADDED Viewed

@@ -0,0 +1,86 @@
+--- !ruby/object:Gem::Specification
+name: engtagger
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Yoichiro Hasebe
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2008-05-08 00:00:00 +09:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+    version:
+- !ruby/object:Gem::Dependency
+  name: hoe
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.1
+    version:
+description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained  tagger that assigns POS tags to English text based on a lookup dictionary and  a set of probability values. The tagger assigns appropriate tags based on  conditional probabilities--it examines the preceding tag to determine the  appropriate tag for the current word. Unknown words are classified according to  word morphology or can be set to be treated as nouns or other parts of speech.   The tagger also extracts as many nouns and noun phrases as it can, using a set  of regular expressions.
+email: yohasebe@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- History.txt
+- LICENSE.txt
+- Manifest.txt
+- README.txt
+files:
+- History.txt
+- LICENSE.txt
+- Manifest.txt
+- README.txt
+- Rakefile
+- lib/engtagger.rb
+- lib/engtagger/porter.rb
+- lib/engtagger/pos_tags.hash
+- lib/engtagger/pos_words.hash
+- lib/engtagger/tags.yml
+- lib/engtagger/unknown.yml
+- lib/engtagger/words.yml
+- test/test_engtagger.rb
+has_rdoc: true
+homepage: http://engtagger.rubyforge.org
+post_install_message:
+rdoc_options:
+- --main
+- README.txt
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: engtagger
+rubygems_version: 1.1.0
+signing_key:
+specification_version: 2
+summary: English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
+test_files:
+- test/test_engtagger.rb