RubyGems - czech-stemmer - Versions diffs - 0.0.0 - Mend

czech-stemmer 0.0.0

Files changed (17) hide show

checksums.yaml +7 -0
data/.document +5 -0
data/Gemfile +9 -0
data/Gemfile.lock +80 -0
data/LICENSE.txt +20 -0
data/README.markdown +10 -0
data/Rakefile +51 -0
data/VERSION +1 -0
data/czech-stemmer.gemspec +66 -0
data/lib/czech-stemmer.rb +125 -0
data/test/CzechStemmer.java +173 -0
data/test/TestCzechStemmer.java +300 -0
data/test/TestCzechStemmer.java.txt +300 -0
data/test/helper.rb +2 -0
data/test/java_test_converter.bash +7 -0
data/test/test_czech-stemmer.rb +221 -0
metadata +130 -0

data/test/TestCzechStemmer.java ADDED Viewed

@@ -0,0 +1,300 @@
+package org.apache.lucene.analysis.cz;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+/**
+ * Test the Czech Stemmer.
+ *
+ * Note: it's algorithmic, so some stems are nonsense
+ *
+ */
+public class TestCzechStemmer extends BaseTokenStreamTestCase {
+  /**
+   * Test showing how masculine noun forms conflate
+   * <p>
+   * Each call feeds one inflected form to a CzechAnalyzer and supplies a
+   * one-element array holding the stem expected for that form; all forms
+   * within a commented group share the same expected stem. The stems are
+   * simply what the algorithm yields and need not be real words.
+   */
+  public void testMasculineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    /* animate ending with a hard consonant */
+    assertAnalyzesTo(cz, "pán", new String[] { "pán" });
+    assertAnalyzesTo(cz, "páni", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánové", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pána", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánů", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánovi", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánům", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pány", new String[] { "pán" });
+    assertAnalyzesTo(cz, "páne", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánech", new String[] { "pán" });
+    assertAnalyzesTo(cz, "pánem", new String[] { "pán" });
+    /* inanimate ending with hard consonant */
+    assertAnalyzesTo(cz, "hrad", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradu", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hrade", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradem", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hrady", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradech", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradům", new String[] { "hrad" });
+    assertAnalyzesTo(cz, "hradů", new String[] { "hrad" });
+    /* animate ending with a soft consonant */
+    assertAnalyzesTo(cz, "muž", new String[] { "muh" });
+    assertAnalyzesTo(cz, "muži", new String[] { "muh" });
+    assertAnalyzesTo(cz, "muže", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužů", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužům", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužích", new String[] { "muh" });
+    assertAnalyzesTo(cz, "mužem", new String[] { "muh" });
+    /* inanimate ending with a soft consonant */
+    assertAnalyzesTo(cz, "stroj", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "stroje", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojů", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "stroji", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojům", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojích", new String[] { "stroj" });
+    assertAnalyzesTo(cz, "strojem", new String[] { "stroj" });
+    /* ending with a */
+    assertAnalyzesTo(cz, "předseda", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedové", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedy", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedů", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedovi", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedům", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedu", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedo", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedech", new String[] { "předsd" });
+    assertAnalyzesTo(cz, "předsedou", new String[] { "předsd" });
+    /* ending with e */
+    assertAnalyzesTo(cz, "soudce", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudci", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudců", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcům", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcích", new String[] { "soudk" });
+    assertAnalyzesTo(cz, "soudcem", new String[] { "soudk" });
+  }
+  /**
+   * Test showing how feminine noun forms conflate
+   * <p>
+   * Forms in a group share one expected stem, except "píseň", whose
+   * expected stem "písň" differs from the "písn" expected for the other
+   * forms of that group.
+   */
+  public void testFeminineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    /* ending with hard consonant */
+    assertAnalyzesTo(cz, "kost", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kosti", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostí", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostem", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostech", new String[] { "kost" });
+    assertAnalyzesTo(cz, "kostmi", new String[] { "kost" });
+    /* ending with a soft consonant */
+    // note: in this example sing nom. and sing acc. don't conflate w/ the rest
+    assertAnalyzesTo(cz, "píseň", new String[] { "písň" });
+    assertAnalyzesTo(cz, "písně", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písni", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písněmi", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písních", new String[] { "písn" });
+    assertAnalyzesTo(cz, "písním", new String[] { "písn" });
+    /* ending with e */
+    assertAnalyzesTo(cz, "růže", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růží", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžím", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžích", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růžemi", new String[] { "růh" });
+    assertAnalyzesTo(cz, "růži", new String[] { "růh" });
+    /* ending with a */
+    assertAnalyzesTo(cz, "žena", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženy", new String[] { "žn" });
+    assertAnalyzesTo(cz, "žen", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženě", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženám", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženu", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženo", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženách", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženou", new String[] { "žn" });
+    assertAnalyzesTo(cz, "ženami", new String[] { "žn" });
+  }
+  /**
+   * Test showing how neuter noun forms conflate
+   * <p>
+   * All forms within a commented group are expected to produce the same
+   * single stem.
+   */
+  public void testNeuterNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    /* ending with o */
+    assertAnalyzesTo(cz, "město", new String[] { "měst" });
+    assertAnalyzesTo(cz, "města", new String[] { "měst" });
+    assertAnalyzesTo(cz, "měst", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městu", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městům", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městě", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městech", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městem", new String[] { "měst" });
+    assertAnalyzesTo(cz, "městy", new String[] { "měst" });
+    /* ending with e */
+    assertAnalyzesTo(cz, "moře", new String[] { "moř" });
+    assertAnalyzesTo(cz, "moří", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořím", new String[] { "moř" });
+    assertAnalyzesTo(cz, "moři", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořích", new String[] { "moř" });
+    assertAnalyzesTo(cz, "mořem", new String[] { "moř" });
+    /* ending with ě */
+    assertAnalyzesTo(cz, "kuře", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřata", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřete", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřat", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřeti", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřatům", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřatech", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřetem", new String[] { "kuř" });
+    assertAnalyzesTo(cz, "kuřaty", new String[] { "kuř" });
+    /* ending with í */
+    assertAnalyzesTo(cz, "stavení", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "stavením", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "staveních", new String[] { "stavn" });
+    assertAnalyzesTo(cz, "staveními", new String[] { "stavn" });
+  }
+  /**
+   * Test showing how adjectival forms conflate
+   * <p>
+   * All forms within a commented group are expected to produce the same
+   * single stem.
+   */
+  public void testAdjectives() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    /* ending with ý/á/é */
+    assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladí", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladého", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladých", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladému", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladým", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladé", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladém", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladými", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladá", new String[] { "mlad" });
+    assertAnalyzesTo(cz, "mladou", new String[] { "mlad" });
+    /* ending with í */
+    assertAnalyzesTo(cz, "jarní", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarního", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarních", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarnímu", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarním", new String[] { "jarn" });
+    assertAnalyzesTo(cz, "jarními", new String[] { "jarn" });
+  }
+  /**
+   * Test some possessive suffixes
+   * <p>
+   * The capitalized input "Karlův" is expected back as lower-case "karl",
+   * so the analyzer presumably lower-cases terms in addition to stemming
+   * them (NOTE(review): confirm against CzechAnalyzer).
+   */
+  public void testPossessive() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
+    assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
+  }
+  /**
+   * Test some exceptional rules, implemented as rewrites.
+   * <p>
+   * Each pair (or triple) of forms below is expected to end up with one
+   * shared stem; the comment above the group names the rewrite involved.
+   */
+  public void testExceptions() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    /* rewrite of št -> sk */
+    assertAnalyzesTo(cz, "český", new String[] { "česk" });
+    assertAnalyzesTo(cz, "čeští", new String[] { "česk" });
+    /* rewrite of čt -> ck */
+    assertAnalyzesTo(cz, "anglický", new String[] { "anglick" });
+    assertAnalyzesTo(cz, "angličtí", new String[] { "anglick" });
+    /* rewrite of z -> h */
+    assertAnalyzesTo(cz, "kniha", new String[] { "knih" });
+    assertAnalyzesTo(cz, "knize", new String[] { "knih" });
+    /* rewrite of ž -> h */
+    assertAnalyzesTo(cz, "mazat", new String[] { "mah" });
+    assertAnalyzesTo(cz, "mažu", new String[] { "mah" });
+    /* rewrite of c -> k */
+    assertAnalyzesTo(cz, "kluk", new String[] { "kluk" });
+    assertAnalyzesTo(cz, "kluci", new String[] { "kluk" });
+    assertAnalyzesTo(cz, "klucích", new String[] { "kluk" });
+    /* rewrite of č -> k */
+    assertAnalyzesTo(cz, "hezký", new String[] { "hezk" });
+    assertAnalyzesTo(cz, "hezčí", new String[] { "hezk" });
+    /* rewrite of *ů* -> *o* */
+    assertAnalyzesTo(cz, "hůl", new String[] { "hol" });
+    assertAnalyzesTo(cz, "hole", new String[] { "hol" });
+    /* rewrite of e* -> * */
+    assertAnalyzesTo(cz, "deska", new String[] { "desk" });
+    assertAnalyzesTo(cz, "desek", new String[] { "desk" });
+  }
+  /**
+   * Test that very short words are not stemmed.
+   * <p>
+   * The one- and two-character inputs used here are expected back
+   * unchanged.
+   */
+  public void testDontStem() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(cz, "e", new String[] { "e" });
+    assertAnalyzesTo(cz, "zi", new String[] { "zi" });
+  }
+  public void testWithKeywordAttribute() throws IOException { // terms marked as keywords must bypass stemming
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true); // NOTE(review): args look like (version, initial size, ignoreCase) - confirm against CharArraySet
+    set.add("hole"); // the only keyword; testExceptions expects "hole" to stem to "hol" when unprotected
+    CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
+        new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
+    assertTokenStreamContents(filter, new String[] { "hole", "desk" }); // "hole" kept verbatim, "desek" still stemmed
+  }
+  public void testEmptyTerm() throws IOException { // an empty term must come out of CzechStemFilter as an empty term
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader); // presumably emits the whole input as a single token - confirm
+        return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
+      }
+    };
+    checkOneTerm(a, "", ""); // input "" -> expected output ""
+  }
+}

data/test/TestCzechStemmer.java.txt ADDED Viewed

@@ -0,0 +1,300 @@
+package org.apache.lucene.analysis.cz;
+#
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+#*
+ * Test the Czech Stemmer.
+ *
+ * Note: its algorithmic, so some stems are nonsense
+ *
+public class TestCzechStemmer extends BaseTokenStreamTestCase {
+  #*
+   * Test showing how masculine noun forms conflate
+  public void testMasculineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    # animate ending with a hard consonant
+    assert_analyzes_to( "pán",  "pán" )
+    assert_analyzes_to( "páni",  "pán" )
+    assert_analyzes_to( "pánové",  "pán" )
+    assert_analyzes_to( "pána",  "pán" )
+    assert_analyzes_to( "pánů",  "pán" )
+    assert_analyzes_to( "pánovi",  "pán" )
+    assert_analyzes_to( "pánům",  "pán" )
+    assert_analyzes_to( "pány",  "pán" )
+    assert_analyzes_to( "páne",  "pán" )
+    assert_analyzes_to( "pánech",  "pán" )
+    assert_analyzes_to( "pánem",  "pán" )
+    # inanimate ending with hard consonant
+    assert_analyzes_to( "hrad",  "hrad" )
+    assert_analyzes_to( "hradu",  "hrad" )
+    assert_analyzes_to( "hrade",  "hrad" )
+    assert_analyzes_to( "hradem",  "hrad" )
+    assert_analyzes_to( "hrady",  "hrad" )
+    assert_analyzes_to( "hradech",  "hrad" )
+    assert_analyzes_to( "hradům",  "hrad" )
+    assert_analyzes_to( "hradů",  "hrad" )
+    # animate ending with a soft consonant
+    assert_analyzes_to( "muž",  "muh" )
+    assert_analyzes_to( "muži",  "muh" )
+    assert_analyzes_to( "muže",  "muh" )
+    assert_analyzes_to( "mužů",  "muh" )
+    assert_analyzes_to( "mužům",  "muh" )
+    assert_analyzes_to( "mužích",  "muh" )
+    assert_analyzes_to( "mužem",  "muh" )
+    # inanimate ending with a soft consonant
+    assert_analyzes_to( "stroj",  "stroj" )
+    assert_analyzes_to( "stroje",  "stroj" )
+    assert_analyzes_to( "strojů",  "stroj" )
+    assert_analyzes_to( "stroji",  "stroj" )
+    assert_analyzes_to( "strojům",  "stroj" )
+    assert_analyzes_to( "strojích",  "stroj" )
+    assert_analyzes_to( "strojem",  "stroj" )
+    # ending with a
+    assert_analyzes_to( "předseda",  "předsd" )
+    assert_analyzes_to( "předsedové",  "předsd" )
+    assert_analyzes_to( "předsedy",  "předsd" )
+    assert_analyzes_to( "předsedů",  "předsd" )
+    assert_analyzes_to( "předsedovi",  "předsd" )
+    assert_analyzes_to( "předsedům",  "předsd" )
+    assert_analyzes_to( "předsedu",  "předsd" )
+    assert_analyzes_to( "předsedo",  "předsd" )
+    assert_analyzes_to( "předsedech",  "předsd" )
+    assert_analyzes_to( "předsedou",  "předsd" )
+    # ending with e
+    assert_analyzes_to( "soudce",  "soudk" )
+    assert_analyzes_to( "soudci",  "soudk" )
+    assert_analyzes_to( "soudců",  "soudk" )
+    assert_analyzes_to( "soudcům",  "soudk" )
+    assert_analyzes_to( "soudcích",  "soudk" )
+    assert_analyzes_to( "soudcem",  "soudk" )
+  }
+  #*
+   * Test showing how feminine noun forms conflate
+  public void testFeminineNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    # ending with hard consonant
+    assert_analyzes_to( "kost",  "kost" )
+    assert_analyzes_to( "kosti",  "kost" )
+    assert_analyzes_to( "kostí",  "kost" )
+    assert_analyzes_to( "kostem",  "kost" )
+    assert_analyzes_to( "kostech",  "kost" )
+    assert_analyzes_to( "kostmi",  "kost" )
+    # ending with a soft consonant
+    // note: in this example sing nom. and sing acc. don't conflate w/ the rest
+    assert_analyzes_to( "píseň",  "písň" )
+    assert_analyzes_to( "písně",  "písn" )
+    assert_analyzes_to( "písni",  "písn" )
+    assert_analyzes_to( "písněmi",  "písn" )
+    assert_analyzes_to( "písních",  "písn" )
+    assert_analyzes_to( "písním",  "písn" )
+    # ending with e
+    assert_analyzes_to( "růže",  "růh" )
+    assert_analyzes_to( "růží",  "růh" )
+    assert_analyzes_to( "růžím",  "růh" )
+    assert_analyzes_to( "růžích",  "růh" )
+    assert_analyzes_to( "růžemi",  "růh" )
+    assert_analyzes_to( "růži",  "růh" )
+    # ending with a
+    assert_analyzes_to( "žena",  "žn" )
+    assert_analyzes_to( "ženy",  "žn" )
+    assert_analyzes_to( "žen",  "žn" )
+    assert_analyzes_to( "ženě",  "žn" )
+    assert_analyzes_to( "ženám",  "žn" )
+    assert_analyzes_to( "ženu",  "žn" )
+    assert_analyzes_to( "ženo",  "žn" )
+    assert_analyzes_to( "ženách",  "žn" )
+    assert_analyzes_to( "ženou",  "žn" )
+    assert_analyzes_to( "ženami",  "žn" )
+  }
+  #*
+   * Test showing how neuter noun forms conflate
+  public void testNeuterNouns() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    # ending with o
+    assert_analyzes_to( "město",  "měst" )
+    assert_analyzes_to( "města",  "měst" )
+    assert_analyzes_to( "měst",  "měst" )
+    assert_analyzes_to( "městu",  "měst" )
+    assert_analyzes_to( "městům",  "měst" )
+    assert_analyzes_to( "městě",  "měst" )
+    assert_analyzes_to( "městech",  "měst" )
+    assert_analyzes_to( "městem",  "měst" )
+    assert_analyzes_to( "městy",  "měst" )
+    # ending with e
+    assert_analyzes_to( "moře",  "moř" )
+    assert_analyzes_to( "moří",  "moř" )
+    assert_analyzes_to( "mořím",  "moř" )
+    assert_analyzes_to( "moři",  "moř" )
+    assert_analyzes_to( "mořích",  "moř" )
+    assert_analyzes_to( "mořem",  "moř" )
+    # ending with ě
+    assert_analyzes_to( "kuře",  "kuř" )
+    assert_analyzes_to( "kuřata",  "kuř" )
+    assert_analyzes_to( "kuřete",  "kuř" )
+    assert_analyzes_to( "kuřat",  "kuř" )
+    assert_analyzes_to( "kuřeti",  "kuř" )
+    assert_analyzes_to( "kuřatům",  "kuř" )
+    assert_analyzes_to( "kuřatech",  "kuř" )
+    assert_analyzes_to( "kuřetem",  "kuř" )
+    assert_analyzes_to( "kuřaty",  "kuř" )
+    # ending with í
+    assert_analyzes_to( "stavení",  "stavn" )
+    assert_analyzes_to( "stavením",  "stavn" )
+    assert_analyzes_to( "staveních",  "stavn" )
+    assert_analyzes_to( "staveními",  "stavn" )
+  }
+  #*
+   * Test showing how adjectival forms conflate
+  public void testAdjectives() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    # ending with ý/á/é
+    assert_analyzes_to( "mladý",  "mlad" )
+    assert_analyzes_to( "mladí",  "mlad" )
+    assert_analyzes_to( "mladého",  "mlad" )
+    assert_analyzes_to( "mladých",  "mlad" )
+    assert_analyzes_to( "mladému",  "mlad" )
+    assert_analyzes_to( "mladým",  "mlad" )
+    assert_analyzes_to( "mladé",  "mlad" )
+    assert_analyzes_to( "mladém",  "mlad" )
+    assert_analyzes_to( "mladými",  "mlad" )
+    assert_analyzes_to( "mladá",  "mlad" )
+    assert_analyzes_to( "mladou",  "mlad" )
+    # ending with í
+    assert_analyzes_to( "jarní",  "jarn" )
+    assert_analyzes_to( "jarního",  "jarn" )
+    assert_analyzes_to( "jarních",  "jarn" )
+    assert_analyzes_to( "jarnímu",  "jarn" )
+    assert_analyzes_to( "jarním",  "jarn" )
+    assert_analyzes_to( "jarními",  "jarn" )
+  }
+  #*
+   * Test some possessive suffixes
+  public void testPossessive() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    assert_analyzes_to( "Karlův",  "karl" )
+    assert_analyzes_to( "jazykový",  "jazyk" )
+  }
+  #*
+   * Test some exceptional rules, implemented as rewrites.
+  public void testExceptions() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    # rewrite of št -> sk
+    assert_analyzes_to( "český",  "česk" )
+    assert_analyzes_to( "čeští",  "česk" )
+    # rewrite of čt -> ck
+    assert_analyzes_to( "anglický",  "anglick" )
+    assert_analyzes_to( "angličtí",  "anglick" )
+    # rewrite of z -> h
+    assert_analyzes_to( "kniha",  "knih" )
+    assert_analyzes_to( "knize",  "knih" )
+    # rewrite of ž -> h
+    assert_analyzes_to( "mazat",  "mah" )
+    assert_analyzes_to( "mažu",  "mah" )
+    # rewrite of c -> k
+    assert_analyzes_to( "kluk",  "kluk" )
+    assert_analyzes_to( "kluci",  "kluk" )
+    assert_analyzes_to( "klucích",  "kluk" )
+    # rewrite of č -> k
+    assert_analyzes_to( "hezký",  "hezk" )
+    assert_analyzes_to( "hezčí",  "hezk" )
+    # rewrite of *ů* -> *o*
+    assert_analyzes_to( "hůl",  "hol" )
+    assert_analyzes_to( "hole",  "hol" )
+    # rewrite of e* -> *
+    assert_analyzes_to( "deska",  "desk" )
+    assert_analyzes_to( "desek",  "desk" )
+  }
+  #*
+   * Test that very short words are not stemmed.
+  public void testDontStem() throws IOException {
+    CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
+    assert_analyzes_to( "e",  "e" )
+    assert_analyzes_to( "zi",  "zi" )
+  }
+  public void testWithKeywordAttribute() throws IOException {
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
+    set.add("hole");
+    CzechStemFilter filter = new CzechStemFilter(new SetKeywordMarkerFilter(
+        new MockTokenizer(new StringReader("hole desek"), MockTokenizer.WHITESPACE, false), set));
+    assertTokenStreamContents(filter,  "hole", "desk" )
+  }
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new KeywordTokenizer(reader);
+        return new TokenStreamComponents(tokenizer, new CzechStemFilter(tokenizer));
+      }
+    };
+    checkOneTerm(a, "", "");
+  }
+}

data/test/helper.rb ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ require "minitest/autorun"
2	+ require "czech-stemmer"

data/test/java_test_converter.bash ADDED Viewed

@@ -0,0 +1,7 @@
+cat TestCzechStemmer.java | sed s/assertAnalyzesTo\(cz\,/assert_analyzes_to\(/g | \
+ sed s/\,\ new\ String\\[\\]\ \{/\,\ /g | \
+ sed s/\}\)\;/\)/g | \
+ sed 's/\/\*/\#/g' | \
+ sed 's/\*\///g' | \
+ tee TestCzechStemmer.java.txt