words 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +4 -2
- data/Rakefile +38 -41
- data/VERSION +1 -1
- data/bin/build_wordnet +44 -8
- data/examples.rb +4 -3
- data/lib/evocations.rb +3 -7
- data/lib/homographs.rb +2 -8
- data/lib/relation.rb +54 -55
- data/lib/synset.rb +134 -132
- data/lib/wordnet_connectors/pure_wordnet_connection.rb +130 -48
- data/lib/wordnet_connectors/tokyo_wordnet_connection.rb +73 -17
- data/lib/words.rb +108 -22
- data/spec/words_spec.rb +38 -0
- data/words.gemspec +6 -7
- metadata +5 -6
- data/lib/wordnet_connection.rb +0 -187
@@ -1,18 +1,55 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
module Words
|
2
4
|
|
5
|
+
# Provides a pure ruby connector to the Wordnet dataset.
|
3
6
|
class PureWordnetConnection
|
4
7
|
|
8
|
+
# Convert a single-letter POS to its multi-letter equivalent
|
5
9
|
SHORT_TO_POS_FILE_TYPE = { 'a' => 'adj', 'r' => 'adv', 'n' => 'noun', 'v' => 'verb' }
|
10
|
+
|
11
|
+
# Set of indexes for seeking directly into wordnet files to identify terms with significantly improved performance
|
6
12
|
INDEXES = {
|
7
13
|
:noun => {"mv"=>2908615, "fa"=>1455677, "g-"=>1695451, "hy"=>2196287, "ac"=>21116, "wr"=>4743086, "rt"=>3724403, "k_"=>2405676, "mw"=>2908680, "fb"=>1539515, "g."=>1695573, "hz"=>2219696, "ad"=>48269, "ws"=>4747643, "ru"=>3724431, "mx"=>2908742, "fc"=>1539583, "80"=>6057, "ae"=>63445, "wt"=>4747670, "rv"=>3740230, "ka"=>2405742, "l-"=>2459655, "my"=>2908771, "fd"=>1539637, "af"=>68288, "wu"=>4747756, "rw"=>3740258, "kb"=>2417524, "l."=>2459745, "fe"=>1539722, "ag"=>74279, "wv"=>4748078, "kc"=>2417632, "ah"=>83260, "ry"=>3740424, "pa"=>3143343, "36"=>5141, "ai"=>83677, "ww"=>4748110, "pb"=>3211047, "ke"=>2417664, "aj"=>91267, "v-"=>4545234, "pc"=>3211172, "fh"=>1559167, "ak"=>91562, "wy"=>4748137, "v."=>4545387, "ua"=>4496561, "pd"=>3211308, "kg"=>2427122, "fi"=>1559226, "al"=>92464, "ub"=>4496594, "pe"=>3211419, "2n"=>4947, "fj"=>1596225, "am"=>130827, "kh"=>2427183, "uc"=>4496797, "pf"=>3263095, "88"=>6083, "an"=>154839, "ki"=>2428739, "za"=>4773142, "ud"=>4496830, "fl"=>1596256, "ao"=>203539, "zb"=>4775763, "ph"=>3263286, "fm"=>1622351, "ap"=>204006, "uf"=>4496962, "pi"=>3293279, "fn"=>1622416, "aq"=>218174, "kk"=>2442519, "zd"=>4775847, "ug"=>4497019, "pj"=>3328895, "fo"=>1622444, "ar"=>219963, "kl"=>2442551, "ze"=>4775874, "uh"=>4497483, "pk"=>3328925, "fp"=>1650875, "as"=>262743, "km"=>2443913, "ui"=>4497543, "pl"=>3329011, "at"=>282628, "kn"=>2443973, "pm"=>3357376, "fr"=>1650935, "au"=>299805, "ko"=>2448754, "zh"=>4778739, "uk"=>4497767, "pn"=>3357459, "fs"=>1681993, "av"=>316371, "kp"=>2453337, "zi"=>4778934, "ul"=>4498102, "po"=>3358476, "ft"=>1682056, "aw"=>319552, "um"=>4501463, "fu"=>1682252, "ax"=>320182, "1-"=>1892, "kr"=>2453390, "un"=>4503199, "pp"=>3416671, "ay"=>321448, "ks"=>2455025, "zl"=>4782157, "fw"=>1695021, "az"=>322115, "kt"=>2455090, "d_"=>1083112, "up"=>4528358, "pr"=>3416755, "o'"=>3029255, "10"=>1959, "ku"=>2455116, "zn"=>4782189, "ps"=>3483993, "e-"=>1289529, "fy"=>1695051, "kv"=>2458073, "da"=>1083181, "zo"=>4782217, "ur"=>4532258, 
"pt"=>3492829, "i_"=>2220034, "11"=>2218, "kw"=>2458174, "db"=>1108193, "e."=>1289664, "us"=>4538820, "t'"=>4259996, "pu"=>3496345, "60"=>5843, "12"=>2315, "dc"=>1108287, "ut"=>4542211, "pv"=>3517927, "ia"=>2220399, "13"=>2445, "ky"=>2458844, "zr"=>4784927, "pw"=>3517990, "ib"=>2220863, "j."=>2341407, "14"=>2471, "dd"=>1108386, "zs"=>4784956, "uu"=>4544207, "px"=>3518017, "ic"=>2221692, "15"=>2558, "de"=>1108520, "uv"=>4544342, "py"=>3518043, "o."=>3029509, "na"=>2919040, "id"=>2226538, "16"=>2733, "df"=>1168182, "zu"=>4784989, "s_"=>3741387, "nb"=>2942448, "ie"=>2230327, "17"=>2788, "dg"=>1168212, "ux"=>4544722, "t-"=>4260104, "nc"=>2942542, "if"=>2230421, "18"=>3024, "dh"=>1168244, "zw"=>4785347, "uy"=>4544863, "t."=>4260425, "sa"=>3741419, "nd"=>2942608, "ig"=>2230448, "19"=>3319, "di"=>1168953, "x_"=>4749915, "uz"=>4544913, "sb"=>3800209, "ne"=>2942718, "dj"=>1223633, "zy"=>4785471, "y-"=>4755272, "sc"=>3800328, "dk"=>1223962, "xa"=>4749991, "sd"=>3836240, "ng"=>2975369, "ii"=>2232411, "dl"=>1224061, "se"=>3836272, "nh"=>2975761, "ij"=>2232906, "dm"=>1224120, "xc"=>4750937, "sf"=>3898201, "ni"=>2975793, "ik"=>2233046, "dn"=>1224525, "sg"=>3898276, "nj"=>2989622, "il"=>2233145, "do"=>1224823, "y2"=>4755339, "xe"=>4750963, "sh"=>3898399, "im"=>2236982, "dp"=>1251815, "si"=>3934020, "nl"=>2989719, "in"=>2250132, "sj"=>3974373, "nm"=>2989774, "io"=>2317192, "dr"=>1251968, "xh"=>4752829, "sk"=>3974412, "ip"=>2319242, "ds"=>1271920, "xi"=>4752879, "sl"=>3982232, "nn"=>2989842, "iq"=>2320204, "dt"=>1272024, "sm"=>3995291, "no"=>2989930, "ir"=>2320265, "du"=>1272082, "sn"=>4003308, "np"=>3016438, "is"=>2328830, "dv"=>1284206, "xl"=>4753577, "so"=>4011968, "it"=>2336645, "b_"=>324352, "dw"=>1284263, "xm"=>4753603, "sp"=>4051506, "nr"=>3016535, "iu"=>2338757, "0"=>1840, "4-"=>5374, "sq"=>4097051, "ns"=>3016775, "iv"=>2338786, "1"=>1865, "ba"=>324554, "c-"=>600455, "dy"=>1286409, "xo"=>4753634, "sr"=>4102220, "nt"=>3016984, "8_"=>6119, "iw"=>2340321, "2"=>4177, 
"c."=>600659, "dz"=>1289430, "g_"=>1695801, "nu"=>3017043, "ix"=>2340452, "9-"=>6205, "3"=>4985, "bb"=>390069, "40"=>5406, "ss"=>4102507, "nv"=>3026658, "iy"=>2341048, "4"=>5349, "3d"=>5205, "ga"=>1695861, "h-"=>2030546, "st"=>4102714, "nw"=>3026690, "iz"=>2341117, "9/"=>6238, "5"=>5594, "bd"=>390218, "gb"=>1726120, "h."=>2030576, "su"=>4180331, "90"=>6271, "6"=>5818, "be"=>390276, "c2"=>601143, "gc"=>1726268, "xt"=>4753701, "sv"=>4232564, "ny"=>3026772, "7"=>5946, "la"=>2459898, "m-"=>2643999, "gd"=>1726351, "sw"=>4232896, "q_"=>3524972, "8"=>6032, "lb"=>2507825, "m."=>2644096, "44"=>5495, "ge"=>1726452, "xv"=>4753754, "r-"=>3544131, "9"=>6180, "bh"=>428251, "lc"=>2507915, "h2"=>2030821, "sy"=>4247569, "r."=>3544158, "qa"=>3525003, "bi"=>428758, "ld"=>2507971, "v_"=>4545417, "sz"=>4259706, "bj"=>454188, "le"=>2508074, "m1"=>2644168, "xx"=>4753864, "qc"=>3525650, "bk"=>454250, "lf"=>2545647, "m2"=>2644194, "gh"=>1914825, "xy"=>4754258, "w."=>4622501, "va"=>4545477, "bl"=>454276, "lg"=>2545676, "m3"=>2644220, "gi"=>1915953, "qe"=>3525677, "bm"=>487643, "lh"=>2545732, "gj"=>1928001, "vc"=>4562340, "bn"=>487795, "li"=>2545866, "vd"=>4562367, "bo"=>487822, "lj"=>2588790, "gl"=>1928034, "ve"=>4562424, "bp"=>527090, "gm"=>1941192, "vf"=>4587559, "qi"=>3525733, "3r"=>5272, "gn"=>1941253, "br"=>527207, "ll"=>2588826, "go"=>1942339, "vh"=>4587589, "lm"=>2589254, "gp"=>1965489, "a'"=>6392, "bs"=>567010, "vi"=>4587630, "3t"=>5322, "bt"=>567093, "lo"=>2589280, "gr"=>1965634, "bu"=>567123, "lp"=>2623355, "gs"=>2010072, "bv"=>598604, "vl"=>4612572, "qo"=>3526131, "1_"=>3606, "bw"=>598664, "2-"=>4204, "lr"=>2623408, "gu"=>2010162, "8v"=>6153, "k'"=>2405456, "ls"=>2623434, "a-"=>6423, "by"=>598787, "vo"=>4613249, "lt"=>2623463, "e_"=>1290183, "6_"=>5911, "gw"=>2026208, "a."=>6630, "lu"=>2623552, "'h"=>1740, "20"=>4330, "ea"=>1290252, "gy"=>2026300, "21"=>4385, "vr"=>4621015, "lw"=>2635752, "eb"=>1300178, "f."=>1455392, "j_"=>2341888, "qu"=>3526162, "lx"=>2635783, "ec"=>1301281, 
"70"=>5971, "22"=>4411, "vt"=>4621044, "ly"=>2635907, "ed"=>1308417, "ja"=>2341922, "k-"=>2405491, "23"=>4474, "vu"=>4621076, "qw"=>3544030, "o_"=>3029601, "k."=>2405619, "24"=>4500, "p-"=>3142944, "ee"=>1316159, "25"=>4636, "p."=>3143064, "oa"=>3029664, "ef"=>1316593, "jd"=>2362188, "26"=>4662, "t_"=>4260563, "p/"=>3143308, "ob"=>3030924, "eg"=>1318289, "je"=>2362216, "27"=>4688, "vx"=>4622352, "u-"=>4495612, "oc"=>3037012, "eh"=>1321628, "jf"=>2371138, "k2"=>2405647, "28"=>4714, "vy"=>4622382, "u."=>4495708, "ta"=>4260664, "od"=>3042646, "ei"=>1321758, "29"=>4740, "y_"=>4755366, "tb"=>4295216, "oe"=>3044953, "ej"=>1323919, "'s"=>1771, "jh"=>2371165, "z-"=>4773112, "tc"=>4295357, "of"=>3046532, "ek"=>1324264, "78"=>5997, "ji"=>2371193, "ya"=>4755402, "td"=>4295640, "og"=>3049310, "el"=>1324361, "yb"=>4759174, "te"=>4295669, "oh"=>3049737, "em"=>1348056, "u3"=>4496533, "oi"=>3050182, "en"=>1357595, "oj"=>3052575, "eo"=>1377701, "ye"=>4759264, "th"=>4330947, "ok"=>3052696, "1s"=>3787, "ep"=>1378260, "ti"=>4366648, "ol"=>3053511, "jn"=>2373545, "eq"=>1387580, "yg"=>4767903, "tj"=>4385574, "om"=>3062383, "jo"=>2373601, "er"=>1391721, "yh"=>4767972, "tk"=>4385664, "on"=>3064512, "d'"=>1082835, "es"=>1401937, "yi"=>4768028, "tl"=>4385691, "et"=>1408856, "tm"=>4385787, "oo"=>3070387, "jr"=>2392018, "eu"=>1413487, "tn"=>4385843, "op"=>3071039, "a"=>6297, "ev"=>1427580, "yl"=>4768444, "to"=>4385934, "b"=>323845, "c_"=>601171, "ew"=>1432034, "ym"=>4768512, "tp"=>4413193, "or"=>3081061, "c"=>600316, "ju"=>2392073, "ex"=>1432298, "n'"=>2918885, "5-"=>5619, "os"=>3115959, "d"=>1082786, "jv"=>2405234, "ca"=>601439, "d-"=>1082871, "ey"=>1452457, "yo"=>4768542, "tr"=>4413220, "ot"=>3122137, "e"=>1289463, "cb"=>712079, "d."=>1082934, "ez"=>1454953, "yp"=>4771198, "ts"=>4466928, "ou"=>3124879, "f"=>1455328, "50"=>5765, "yq"=>4771250, "ov"=>3129739, "ha"=>2030856, "i-"=>2219776, "g"=>1695338, "jy"=>2405260, "cc"=>712135, "yr"=>4771279, "tt"=>4467892, "ow"=>3136728, "hb"=>2076148, 
"i."=>2219806, "h"=>2030472, "cd"=>712198, "tu"=>4467944, "ox"=>3137307, "hc"=>2076182, "i"=>2219725, "ce"=>712729, "n-"=>2918921, "yt"=>4771310, "tv"=>4484640, "oy"=>3141259, "hd"=>2076237, "j"=>2341367, "cf"=>737620, "ma"=>2644246, "yu"=>4771416, "tw"=>4485217, "r_"=>3544308, "oz"=>3142126, "n."=>2918965, "he"=>2076337, "k"=>2405363, "cg"=>737739, "mb"=>2737124, "yv"=>4773040, "tx"=>4490575, "s-"=>3741191, "hf"=>2121232, "l"=>2459527, "ch"=>737800, "mc"=>2737372, "ty"=>4490610, "s."=>3741222, "ra"=>3544339, "hg"=>2121297, "m"=>2643918, "ci"=>811192, "md"=>2738186, "tz"=>4495399, "s/"=>3741360, "rb"=>3580128, "n"=>2918808, "cj"=>827445, "me"=>2738337, "x-"=>4749199, "rc"=>3580216, "o"=>3029204, "hh"=>2121341, "mf"=>2788090, "wa"=>4622931, "p"=>3142904, "hi"=>2121371, "cl"=>827472, "mg"=>2788180, "wb"=>4654707, "re"=>3580247, "q"=>3524944, "cm"=>860967, "mh"=>2788224, "rf"=>3658425, "r"=>3544069, "cn"=>861094, "mi"=>2788281, "s"=>3741105, "rg"=>3658504, "co"=>861878, "hl"=>2139669, "we"=>4654819, "t"=>4259917, "rh"=>3658530, "cp"=>1012981, "mk"=>2830687, "hm"=>2139701, "u"=>4495561, "ri"=>3667785, "ml"=>2830716, "hn"=>2139877, "v"=>4545170, "cr"=>1013175, "ho"=>2139935, "wh"=>4672549, "w"=>4622437, "cs"=>1048516, "mm"=>2830804, "hp"=>2182075, "x"=>4749153, "wi"=>4692782, "ct"=>1048663, "mn"=>2830893, "4t"=>5531, "hq"=>2182104, "y"=>4755232, "cu"=>1049194, "mo"=>2831144, "hr"=>2182134, "z"=>4773075, "rn"=>3687863, "cv"=>1068811, "mp"=>2881103, "hs"=>2182299, "wl"=>4724359, "ro"=>3688004, "2_"=>4766, "cw"=>1068869, "4w"=>5558, "ht"=>2182563, "a_"=>7001, "wm"=>4724387, "rp"=>3724343, "l'"=>2459588, "3-"=>5010, "mr"=>2881244, "hu"=>2182649, "wn"=>4724445, "b-"=>323934, "cy"=>1068938, "ms"=>2881650, "wo"=>4724472, "b."=>324186, "cz"=>1082090, "mt"=>2881981, "f_"=>1455546, "hw"=>2196252, "aa"=>7256, "wp"=>4743059, "30"=>5078, "mu"=>2882421, ".2"=>1811, "ab"=>8002},
|
8
14
|
:adj => {"2d"=>4592, "31"=>4851, "fa"=>261714, ".3"=>1880, "hy"=>340391, "ac"=>12021, "32"=>4905, "wr"=>818088, ".4"=>2020, "ad"=>18614, "80"=>7432, "ae"=>23100, "ru"=>595594, "ka"=>388840, "l-"=>392296, "33"=>4959, "my"=>455103, "81"=>7486, "af"=>24303, "34"=>5013, "fe"=>267964, "82"=>7512, "ag"=>26180, "rw"=>598392, "35"=>5067, "83"=>7538, "ah"=>28531, "36"=>5121, "pa"=>505816, "84"=>7564, "ai"=>28659, "ke"=>389430, "37"=>5175, "85"=>7590, "aj"=>29502, "38"=>5229, "v-"=>789182, "86"=>7644, "ak"=>29530, "39"=>5283, "fi"=>270598, "wy"=>819172, "87"=>7670, "al"=>29597, "2n"=>4618, "ub"=>722231, "pe"=>515525, "am"=>36386, "kh"=>389925, "88"=>7696, "an"=>40420, "ki"=>389956, "za"=>822848, "fl"=>275697, "89"=>7722, "ao"=>51761, "ph"=>523560, "ap"=>51861, "pi"=>526560, "aq"=>56753, "ug"=>722267, "fo"=>280302, "ar"=>57023, "ze"=>823049, "as"=>62836, "pl"=>530025, "at"=>67162, "kn"=>390913, "fr"=>286656, "au"=>69848, "ko"=>391903, "uk"=>722361, "pn"=>534250, "7t"=>7346, "av"=>73727, "zi"=>823276, "ul"=>722396, "po"=>534415, "aw"=>74649, "um"=>723080, "fu"=>290894, "ax"=>75702, "un"=>723684, "az"=>76033, "up"=>785441, "6-"=>6566, "pr"=>541620, "10"=>2210, "ku"=>392129, "ps"=>556687, "da"=>189020, "zo"=>823490, "pt"=>558010, "11"=>2525, "ur"=>787459, "60"=>6600, "pu"=>558096, "12"=>2691, "us"=>788002, "61"=>6654, "ia"=>343317, "13"=>2857, "ky"=>392202, "ut"=>788423, "62"=>6680, "ib"=>343385, "14"=>3023, "63"=>6706, "ic"=>343452, "15"=>3189, "de"=>191816, "na"=>456102, "o."=>482752, "64"=>6732, "py"=>562611, "id"=>344057, "16"=>3355, "uv"=>788962, "65"=>6786, "17"=>3521, "if"=>345034, "18"=>3687, "t-"=>685912, "ux"=>789057, "66"=>6840, "ig"=>345064, "19"=>3797, "sa"=>598455, "di"=>206077, "67"=>6866, "dj"=>220263, "uz"=>789122, "ne"=>459116, "zy"=>823837, "y-"=>820785, "68"=>6892, "sc"=>604552, "69"=>6918, "xa"=>819260, "ii"=>345421, "se"=>608988, "xc"=>819292, "ni"=>464278, "il"=>345474, "do"=>220299, "y2"=>820817, "xe"=>819571, "im"=>347578, "sh"=>623055, "in"=>353335, 
"si"=>629245, "io"=>380365, "dr"=>225168, "ip"=>380637, "sk"=>635321, "5t"=>6514, "xi"=>819747, "sl"=>636220, "ir"=>380672, "sm"=>639638, "du"=>227825, "no"=>466328, "is"=>382721, "sn"=>641766, "xl"=>819882, "it"=>383968, "so"=>643211, "dw"=>229873, "0"=>2160, "sp"=>649763, "4-"=>5389, "iv"=>384300, "1"=>2185, "ba"=>76418, "sq"=>656684, "dy"=>229940, "2"=>3934, "sr"=>658125, "nt"=>481308, "ix"=>384390, "9-"=>7800, "3"=>4645, "40"=>5460, "nu"=>481335, "4"=>5364, "41"=>5570, "ga"=>294211, "h-"=>313795, "5"=>6083, "st"=>658188, "42"=>5624, "90"=>7834, "6"=>6541, "be"=>83696, "su"=>671222, "43"=>5678, "91"=>7888, "7"=>6971, "sv"=>681720, "la"=>392328, "ny"=>482674, "92"=>7914, "8"=>7373, "sw"=>681768, "44"=>5732, "ge"=>296692, "xv"=>820134, "93"=>7940, "9"=>7775, "bh"=>89324, "45"=>5786, "r."=>566629, "94"=>7966, "bi"=>89359, "sy"=>683436, "46"=>5840, "qa"=>563600, "95"=>7992, "le"=>398566, "47"=>5894, "xx"=>820244, "96"=>8046, "48"=>5948, "w-"=>800937, "gh"=>299675, "97"=>8072, "bl"=>96334, "49"=>6002, "va"=>789214, "gi"=>299913, "98"=>8098, "li"=>402864, "99"=>8124, "bo"=>102434, "gl"=>300821, "ve"=>791850, "3r"=>5337, "gn"=>303067, "br"=>107474, "go"=>303237, "vi"=>795294, "lo"=>409658, "gr"=>305879, "bu"=>114508, "8t"=>7748, "2-"=>3959, "gu"=>312309, "a-"=>8177, "by"=>118813, "vo"=>799000, "a."=>8266, "lu"=>416153, "7-"=>6996, "20"=>3996, "lv"=>417909, "ea"=>230712, "gy"=>313409, "21"=>4106, "eb"=>232434, "lx"=>418019, "ec"=>232594, "70"=>7030, "qu"=>563668, "22"=>4160, "ly"=>418802, "ed"=>233599, "71"=>7084, "ja"=>384470, "23"=>4214, "vu"=>800557, "72"=>7110, "24"=>4268, "ee"=>234244, "73"=>7136, "25"=>4322, "ef"=>234377, "oa"=>482782, "p."=>505788, "74"=>7162, "26"=>4376, "eg"=>235115, "ob"=>482870, "75"=>7188, "je"=>385496, "27"=>4430, "u-"=>722199, "oc"=>484785, "76"=>7242, "28"=>4484, "ei"=>235526, "od"=>485595, "29"=>4538, "ta"=>685944, "oe"=>486011, "77"=>7268, "of"=>486090, "78"=>7294, "ji"=>386292, "ya"=>820854, "el"=>236340, "79"=>7320, "em"=>238831, 
"oh"=>487955, "te"=>690141, "en"=>240855, "oi"=>487986, "eo"=>245834, "ye"=>820972, "ok"=>488175, "1s"=>3907, "th"=>695473, "ep"=>245983, "ol"=>488233, "ti"=>702295, "eq"=>247606, "om"=>489492, "jo"=>386595, "er"=>248382, "on"=>489888, "6t"=>6944, "es"=>249587, "yi"=>822204, "et"=>250446, "oo"=>493273, "jr"=>387299, "eu"=>251131, "op"=>493333, "ev"=>252105, "to"=>705018, "or"=>495731, "c"=>118928, "ju"=>387326, "ex"=>253536, "5-"=>6108, "os"=>498199, "d"=>188965, "ca"=>118953, "ey"=>261468, "yo"=>822254, "ot"=>498747, "tr"=>709204, "d."=>188990, "ou"=>499048, "ts"=>717450, "50"=>6142, "ov"=>501567, "ha"=>313827, "cc"=>131547, "51"=>6252, "ow"=>505326, "cd"=>131600, "52"=>6278, "i"=>343292, "ce"=>131626, "tu"=>717516, "n-"=>456074, "53"=>6304, "ox"=>505512, "ma"=>419357, "54"=>6330, "yu"=>822690, "he"=>320515, "k"=>388815, "tw"=>719084, "l"=>392271, "s-"=>598423, "ch"=>134647, "55"=>6356, "ra"=>566657, "m"=>419332, "ci"=>142370, "ty"=>721720, "56"=>6410, "tz"=>722141, "me"=>429139, "57"=>6436, "58"=>6462, "x-"=>819228, "hi"=>327145, "cl"=>144311, "59"=>6488, "wa"=>800969, "re"=>571244, "mi"=>436715, "co"=>150272, "rh"=>587466, "we"=>804320, "hm"=>331476, "ri"=>588287, "u"=>722172, "v"=>789157, "cr"=>176976, "ho"=>331505, "wh"=>809316, "x"=>819203, "ct"=>183185, "mn"=>442650, "4t"=>6056, "wi"=>811263, "cu"=>183216, "mo"=>442756, "9t"=>8150, "cv"=>186983, "ro"=>591099, "a_"=>8294, "cx"=>187009, "3-"=>4670, "hu"=>337473, "cy"=>187229, "cz"=>188827, "wo"=>814766, "30"=>4741, "mu"=>450942, ".2"=>1740, "8-"=>7398, "ab"=>8553},
|
9
15
|
:verb => {"ox"=>317944, "ep"=>169705, "ki"=>261865, "ne"=>302629, "x-"=>522318, "oy"=>318231, "ru"=>392475, "ur"=>502622, "bu"=>56124, "eq"=>169837, "oz"=>318263, "us"=>502895, "aa"=>1740, "er"=>170250, "ut"=>503162, "ab"=>1767, "es"=>170652, "ho"=>232836, "ac"=>3529, "et"=>171392, "ni"=>303882, "ta"=>465047, "ad"=>6490, "da"=>118966, "by"=>61875, "eu"=>171927, "ae"=>8718, "ev"=>172155, "kn"=>263317, "af"=>8929, "ko"=>264585, "ag"=>9437, "ex"=>173105, "te"=>472676, "wa"=>508666, "ga"=>204003, "ey"=>178534, "hu"=>237591, "ai"=>10286, "de"=>120649, "no"=>304585, "th"=>475571, "ti"=>478689, "we"=>512699, "za"=>523414, "ge"=>206371, "ja"=>255802, "hy"=>239280, "al"=>10951, "di"=>136972, "kv"=>264661, "am"=>12800, "wh"=>514913, "an"=>13706, "dj"=>148986, "ze"=>523472, "wi"=>517343, "je"=>256873, "ma"=>280541, "nu"=>305772, "ap"=>16409, "gh"=>210633, "to"=>481065, "aq"=>18267, "gi"=>210790, "ar"=>18347, "zi"=>523585, "as"=>19764, "do"=>149014, "pa"=>318325, "qu"=>356569, "tr"=>483983, "at"=>21915, "gl"=>213421, "ji"=>257383, "me"=>287710, "ts"=>490986, "wo"=>519567, "au"=>23027, "av"=>23935, "dr"=>152421, "gn"=>215270, "g."=>203975, "aw"=>24513, "go"=>215409, "tu"=>491013, "wr"=>520951, "ax"=>24704, "mi"=>290558, "pe"=>324550, "sa"=>395891, "zo"=>523781, "ca"=>61939, "du"=>158291, "tw"=>493797, "az"=>24779, "gr"=>218976, "sc"=>398961, "dw"=>159135, "jo"=>257765, "ph"=>328230, "pi"=>328891, "se"=>403889, "va"=>503309, "ty"=>494840, "fa"=>178632, "dy"=>159383, "gu"=>222269, "ce"=>72643, "mo"=>295025, "pl"=>332016, "sh"=>410924, "si"=>417650, "ve"=>504664, "ya"=>522393, "ch"=>73529, "fe"=>183011, "gy"=>223294, "ju"=>258735, "ci"=>81247, "po"=>336199, "sk"=>421199, "ic"=>240465, "sl"=>422915, "id"=>240617, "sm"=>427144, "vi"=>506088, "ye"=>522895, "cl"=>82562, "la"=>264691, "mu"=>299509, "sn"=>428534, "fi"=>185556, "pr"=>341164, "so"=>430811, "ig"=>240935, "ps"=>350290, "sp"=>433886, "co"=>87668, "pt"=>350435, "sq"=>440723, "yi"=>523082, "fl"=>189489, "le"=>268258, 
"my"=>301203, "pu"=>350501, "ob"=>306433, "vo"=>507446, "cr"=>110051, "oc"=>307497, "fo"=>194383, "od"=>307842, "st"=>442042, "il"=>241065, "li"=>271717, "ra"=>358635, "py"=>356507, "su"=>454237, "vr"=>508439, "cu"=>115910, "im"=>241391, "of"=>307921, "yo"=>523290, "ba"=>24810, "fr"=>199363, "in"=>244059, "og"=>308488, "sw"=>460901, "io"=>254520, "vu"=>508468, "ft"=>202432, "oi"=>308518, "re"=>363091, "sy"=>463729, "ea"=>159598, "cy"=>118743, "fu"=>202461, "ir"=>254766, "ok"=>308584, "be"=>30231, "eb"=>160312, "lo"=>275662, "rh"=>386786, "ec"=>160555, "is"=>255120, "om"=>308616, "ri"=>386952, "ed"=>160739, "it"=>255504, "on"=>308689, "ha"=>223396, "ug"=>495123, "bi"=>37946, "o."=>306373, "ef"=>161223, "oo"=>308723, "xe"=>522360, "eg"=>161607, "op"=>308863, "bl"=>39622, "he"=>228710, "ka"=>259787, "lu"=>279432, "or"=>309814, "ro"=>389309, "ej"=>161771, "os"=>310691, "ul"=>495155, "bo"=>44472, "ek"=>161875, "um"=>495274, "el"=>161933, "ke"=>259848, "na"=>301427, "ly"=>280367, "ou"=>310976, "un"=>495308, "em"=>162861, "hi"=>231023, "ov"=>313178, "e-"=>159560, "br"=>48932, "en"=>164868, "ow"=>317836, "up"=>501838},
|
10
16
|
:adv => {"ul"=>146918, "sa"=>121999, "me"=>87949, "is"=>79726, "al"=>6456, "fu"=>54854, "ty"=>146844, "ro"=>121173, "op"=>100918, "it"=>79763, "am"=>8151, "ba"=>15505, "gi"=>56230, "va"=>156823, "un"=>147057, "sc"=>122613, "pe"=>105172, "an"=>8719, "cy"=>31883, "or"=>101364, "ea"=>40812, "i."=>64191, "up"=>155754, "se"=>123441, "os"=>101738, "mi"=>89536, "lu"=>85885, "eb"=>41346, "ap"=>10042, "bc"=>16615, "gl"=>56390, "ph"=>107333, "ot"=>101845, "ha"=>58556, "do"=>38706, "ec"=>41381, "ve"=>157238, "ur"=>156560, "pi"=>107876, "ou"=>101917, "o."=>96668, "ar"=>10688, "be"=>16664, "ed"=>41593, "us"=>156624, "sh"=>125358, "ov"=>102923, "go"=>56887, "as"=>11428, "ye"=>162287, "ut"=>156758, "si"=>126337, "ru"=>121807, "ow"=>103605, "ly"=>86271, "dr"=>39948, "ee"=>41743, "at"=>12932, "pl"=>108832, "na"=>92504, "he"=>60006, "ef"=>41771, "au"=>14641, "jo"=>80066, "vi"=>157846, "sk"=>127582, "bi"=>18176, "gr"=>57195, "eg"=>41998, "a."=>1802, "av"=>14980, "sl"=>127918, "pn"=>109745, "mo"=>90407, "ke"=>80998, "du"=>40494, "aw"=>15087, "yi"=>162508, "sm"=>128914, "po"=>109782, "ei"=>42035, "ax"=>15383, "ux"=>156789, "ta"=>138198, "sn"=>129359, "ne"=>92966, "gu"=>58424, "bl"=>18795, "so"=>129638, "ca"=>22029, "hi"=>61819, "wa"=>159164, "sp"=>132158, "dy"=>40699, "el"=>42063, "ju"=>80410, "ki"=>81028, "vo"=>158698, "sq"=>133295, "pr"=>111025, "em"=>42369, "fa"=>47339, "bo"=>19295, "te"=>138977, "ps"=>114778, "ni"=>94052, "mu"=>91756, "en"=>42630, "yo"=>162542, "e'"=>40734, "we"=>159590, "pu"=>114861, "p."=>103637, "ib"=>64217, "ep"=>43786, "br"=>19838, "ce"=>23083, "th"=>140092, "st"=>133477, "ic"=>64297, "eq"=>43822, "fe"=>48826, "cf"=>23412, "la"=>81356, "ho"=>62489, "ze"=>162691, "ti"=>142646, "su"=>135818, "my"=>92434, "id"=>64326, "er"=>43965, "kn"=>81226, "wh"=>160191, "vu"=>159098, "es"=>44216, "bu"=>20620, "ch"=>23461, "ie"=>64701, "wi"=>160770, "sw"=>137556, "py"=>115529, "ob"=>96696, "et"=>44370, "a_"=>1884, "b."=>15451, "ci"=>24468, "if"=>64725, "ra"=>116360, 
"oc"=>97499, "no"=>94335, "le"=>82562, "ig"=>64753, "eu"=>44719, "zi"=>162788, "sy"=>137715, "od"=>97535, "fi"=>49357, "e."=>40786, "ev"=>44758, "by"=>20853, "cl"=>24639, "hu"=>63384, "to"=>143107, "of"=>97605, "da"=>31998, "ex"=>45689, "ab"=>2343, "re"=>117290, "li"=>83328, "ac"=>3468, "fl"=>50352, "wo"=>161523, "ga"=>55429, "co"=>25439, "ad"=>4138, "tr"=>145364, "nu"=>96480, "hy"=>63857, "il"=>64855, "ae"=>5138, "rh"=>120391, "im"=>65172, "af"=>5207, "ja"=>79789, "fo"=>50969, "wr"=>162050, "ri"=>120500, "qu"=>115566, "ok"=>98535, "cr"=>30383, "de"=>32710, "in"=>67015, "ag"=>5747, "ma"=>86304, "'t"=>1740, "ah"=>6150, "ge"=>55713, "tu"=>146503, "om"=>98587, "ip"=>78991, "ai"=>6367, "fr"=>53467, "pa"=>103691, "on"=>98620, "lo"=>84939, "je"=>79886, "cu"=>31304, "tw"=>146716, "o'"=>96613, "di"=>35621, "ir"=>79023, "ak"=>6428, "c."=>21975}
|
11
17
|
}
|
18
|
+
|
19
|
+
# Hash object used for caching retrieved terms to further improve retrieval performance
|
12
20
|
WORDS_CACHE = Hash.new
|
13
21
|
|
14
|
-
|
22
|
+
## Returns the current connection status of the wordnet object.
|
23
|
+
#
|
24
|
+
# @return [true, false] The current connection status of the wordnet object.
|
25
|
+
attr_reader :connected
|
15
26
|
|
27
|
+
## Returns the current connection status of the wordnet object.
|
28
|
+
#
|
29
|
+
# @return [true, false] The current connection status of the wordnet object.
|
30
|
+
alias :connected? connected
|
31
|
+
|
32
|
+
# Returns the type of the current wordnet connection.
|
33
|
+
#
|
34
|
+
# @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
|
35
|
+
attr_reader :connection_type
|
36
|
+
|
37
|
+
# Returns the datapath currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
|
38
|
+
#
|
39
|
+
# @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
|
40
|
+
attr_reader :data_path
|
41
|
+
|
42
|
+
# Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
|
43
|
+
#
|
44
|
+
# @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
|
45
|
+
attr_reader :wordnet_path
|
46
|
+
|
47
|
+
# Constructs a new pure ruby connector for use with the words wordnet class.
|
48
|
+
#
|
49
|
+
# @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (evocations etc...)
|
50
|
+
# @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
|
51
|
+
# @return [PureWordnetConnection] A new wordnet connection.
|
52
|
+
# @raise [BadWordnetConnector] If an invalid connector type is provided.
|
16
53
|
def initialize(data_path, wordnet_path)
|
17
54
|
|
18
55
|
@data_path, @wordnet_path, @connection_type, @connected = data_path, wordnet_path, :pure, false
|
@@ -21,6 +58,9 @@ module Words
|
|
21
58
|
|
22
59
|
end
|
23
60
|
|
61
|
+
# Causes the connection specified within the wordnet object to be reopened if currently closed.
|
62
|
+
#
|
63
|
+
# @raise [BadWordnetDataset] If the wordnet database cannot be located (no wordnet path is set).
|
24
64
|
def open!
|
25
65
|
|
26
66
|
raise BadWordnetDataset, "Failed to locate the wordnet database. Please ensure it is installed and that if it resides at a custom path that path is given as an argument when constructing the Words object." if @wordnet_path.nil?
|
@@ -36,6 +76,8 @@ module Words
|
|
36
76
|
|
37
77
|
end
|
38
78
|
|
79
|
+
# Causes the current connection to wordnet to be closed.
|
80
|
+
#
|
39
81
|
def close!
|
40
82
|
|
41
83
|
@connected = false
|
@@ -43,56 +85,29 @@ module Words
|
|
43
85
|
|
44
86
|
end
|
45
87
|
|
46
|
-
#
|
47
|
-
|
88
|
+
# Locates from a term any relevant homographs and constructs a homographs hash.
|
89
|
+
#
|
90
|
+
# @param [String] term The specific term that is desired from within wordnet.
|
91
|
+
# @param [true, false] use_cache Specify whether to use caching when finding and retrieving terms.
|
92
|
+
# @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
|
93
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
48
94
|
def homographs(term, use_cache = true)
|
49
95
|
|
50
96
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
51
97
|
|
52
|
-
#
|
53
|
-
term
|
54
|
-
|
55
|
-
# identify the term initials
|
56
|
-
term_initials = term[0,2]
|
57
|
-
|
58
|
-
# for each index we have
|
59
|
-
INDEXES.keys.each do |index_pos|
|
60
|
-
next unless INDEXES[index_pos].include? term_initials # if the index does not contain the desired word skip the index
|
61
|
-
file = File.new(@wordnet_path + "index.#{index_pos}") # open wordnet index file
|
62
|
-
file.seek INDEXES[index_pos][term_initials] # seek to the index starting point
|
63
|
-
|
64
|
-
while (line = file.gets) && (term_initials == line[0,2]) # break if line if EOF or we are past the term and thus the line doesnt start with the term initials
|
65
|
-
lemma, pos, *index_parts = line.split(' ') # split the line and split off the lemma
|
66
|
-
if (lemma == term || use_cache) # if it's the term we are after or we are using cache then we save the word
|
67
|
-
WORDS_CACHE[lemma] ||= [ lemma ] # ensure that there is datastructure to hold our word information
|
68
|
-
if !WORDS_CACHE[lemma].include?(index_pos) # unless there already exists an entry for said word associated with the current index
|
69
|
-
tagsense_count, *synset_offsets = index_parts.slice(index_parts[1].to_i+3..-1) # seperate out what is useful from the index as a whole
|
70
|
-
WORDS_CACHE[lemma] += [ pos, tagsense_count.to_i, synset_offsets ] # add the tagsense_count and the synsets for the pos
|
71
|
-
break if lemma == term # if we have the word in this index then we can jump out and check the next
|
72
|
-
end
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
file.close # close wordnet index file
|
77
|
-
end unless WORDS_CACHE.include?(term) && use_cache # if we have the term already and are ok with using cache then simply use that!
|
98
|
+
# Ensure that the term is in the cache. If it is not, locate it and add it if possible.
|
99
|
+
cache_ensure_from_wordnet(term, use_cache)
|
78
100
|
|
79
|
-
#
|
80
|
-
|
81
|
-
unless raw_homographs.empty? # if we have something... format it
|
82
|
-
tagsense_counts = Array.new
|
83
|
-
synset_ids = Array.new
|
84
|
-
while !raw_homographs.empty?
|
85
|
-
pos = raw_homographs.shift
|
86
|
-
tagsense_counts << "#{pos}#{raw_homographs.shift}"
|
87
|
-
synset_ids += raw_homographs.shift.map { |sense_offset| "#{pos}#{sense_offset}" }
|
88
|
-
end
|
89
|
-
return { 'lemma' => lemma, 'tagsense_counts' => tagsense_counts.join('|'), 'synset_ids' => synset_ids.join('|') }
|
90
|
-
else
|
91
|
-
return nil # we return nil if we haven't found the term
|
92
|
-
end
|
101
|
+
# We should now either have the word in the cache or have nothing at all... convert the cached entry into the homograph hash format (we do this here, rather than during the caching performed above, to improve performance)
|
102
|
+
cached_entry_to_homograph_hash(term)
|
93
103
|
|
94
104
|
end
|
95
105
|
|
106
|
+
# Locates from a synset_id a specific synset and constructs a synset hash.
|
107
|
+
#
|
108
|
+
# @param [String] synset_id The synset id to locate.
|
109
|
+
# @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
|
110
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
96
111
|
def synset(synset_id)
|
97
112
|
|
98
113
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
@@ -109,18 +124,27 @@ module Words
|
|
109
124
|
|
110
125
|
end
|
111
126
|
|
127
|
+
# Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
|
128
|
+
#
|
129
|
+
# @return [true, false] Whether evocations are currently available or not.
|
112
130
|
def evocations?
|
113
131
|
|
114
132
|
!evocations('n08112402').nil?
|
115
133
|
|
116
134
|
end
|
117
135
|
|
118
|
-
|
136
|
+
# Locates from a synset id any relevant evocations and constructs an evocations hash.
|
137
|
+
#
|
138
|
+
# @see Synset
|
139
|
+
# @param [String] synset_id The id number of a specific synset.
|
140
|
+
# @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
|
141
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
142
|
+
def evocations(synset_id)
|
119
143
|
|
120
144
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
121
145
|
|
122
146
|
if defined? @evocations
|
123
|
-
raw_evocations = @evocations[
|
147
|
+
raw_evocations = @evocations[synset_id + "s"]
|
124
148
|
{ 'relations' => raw_evocations[0], 'means' => raw_evocations[1], 'medians' => raw_evocations[2]} unless raw_evocations.nil?
|
125
149
|
else
|
126
150
|
nil
|
@@ -128,15 +152,73 @@ module Words
|
|
128
152
|
|
129
153
|
end
|
130
154
|
|
131
|
-
|
155
|
+
# Provides a textual description of the current connection state of the Wordnet object.
|
156
|
+
#
|
157
|
+
# @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
|
132
158
|
def to_s
|
133
159
|
|
134
160
|
"Words running in pure mode using wordnet files found at #{wordnet_path}"
|
135
161
|
|
136
162
|
end
|
137
163
|
|
138
|
-
|
139
|
-
|
164
|
+
private
|
165
|
+
|
166
|
+
def cache_ensure_from_wordnet(term, use_cache)
|
167
|
+
|
168
|
+
# clean up the term
|
169
|
+
term = term.gsub(" ", "_").downcase
|
170
|
+
|
171
|
+
# identify the term initials
|
172
|
+
term_initials = term[0,2]
|
173
|
+
|
174
|
+
# for each index we have
|
175
|
+
INDEXES.keys.each do |index_pos|
|
176
|
+
next unless INDEXES[index_pos].include? term_initials # if the index does not contain the desired word skip the index
|
177
|
+
file = File.new(@wordnet_path + "index.#{index_pos}") # open wordnet index file
|
178
|
+
file.seek INDEXES[index_pos][term_initials] # seek to the index starting point
|
179
|
+
|
180
|
+
while (line = file.gets) && (term_initials == line[0,2]) # break if line if EOF or we are past the term and thus the line doesnt start with the term initials
|
181
|
+
break if construct_cache_item(line, term, use_cache, index_pos)
|
182
|
+
end
|
183
|
+
|
184
|
+
file.close # close wordnet index file
|
185
|
+
end unless WORDS_CACHE.include?(term) && use_cache # if we have the term already and are ok with using cache then simply use that!
|
186
|
+
|
187
|
+
end
|
188
|
+
|
189
|
+
def construct_cache_item(line, term, use_cache, index_pos)
|
190
|
+
|
191
|
+
lemma, pos, *index_parts = line.split(' ') # split the line and split off the lemma
|
192
|
+
if (lemma == term || use_cache) # if it's the term we are after or we are using cache then we save the word
|
193
|
+
WORDS_CACHE[lemma] ||= [ lemma ] # ensure that there is datastructure to hold our word information
|
194
|
+
if !WORDS_CACHE[lemma].include?(index_pos) # unless there already exists an entry for said word associated with the current index
|
195
|
+
tagsense_count, *synset_offsets = index_parts.slice(index_parts[1].to_i+3..-1) # seperate out what is useful from the index as a whole
|
196
|
+
WORDS_CACHE[lemma] += [ pos, tagsense_count.to_i, synset_offsets ] # add the tagsense_count and the synsets for the pos
|
197
|
+
return true if lemma == term # if we have the word in this index then we can jump out and check the next index
|
198
|
+
end
|
199
|
+
end
|
200
|
+
return false
|
201
|
+
|
202
|
+
end
|
203
|
+
|
204
|
+
def cached_entry_to_homograph_hash(term)
|
205
|
+
|
206
|
+
lemma, *raw_homographs = WORDS_CACHE[term] # split the homograph
|
207
|
+
unless raw_homographs.empty? # if we have something... format it
|
208
|
+
tagsense_counts = Array.new
|
209
|
+
synset_ids = Array.new
|
210
|
+
while !raw_homographs.empty?
|
211
|
+
pos = raw_homographs.shift
|
212
|
+
tagsense_counts << "#{pos}#{raw_homographs.shift}"
|
213
|
+
synset_ids += raw_homographs.shift.map { |sense_offset| "#{pos}#{sense_offset}" }
|
214
|
+
end
|
215
|
+
return { 'lemma' => lemma, 'tagsense_counts' => tagsense_counts.join('|'), 'synset_ids' => synset_ids.join('|') }
|
216
|
+
else
|
217
|
+
return nil # we return nil if we haven't found the term
|
218
|
+
end
|
219
|
+
|
220
|
+
end
|
221
|
+
|
140
222
|
end
|
141
223
|
|
142
224
|
end
|
@@ -1,16 +1,48 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
1
3
|
# gem includes
|
2
4
|
require 'rubygems'
|
3
5
|
require 'rufus-tokyo' if Gem.available?('rufus-tokyo')
|
4
6
|
|
5
7
|
module Words
|
6
8
|
|
9
|
+
# Provides a Tokyo Cabinet connector to the Wordnet dataset.
|
7
10
|
class TokyoWordnetConnection
|
8
11
|
|
9
|
-
|
10
|
-
|
12
|
+
## Returns the current connection status of the wordnet object.
|
13
|
+
#
|
14
|
+
# @return [true, false] The current connection status of the wordnet object.
|
15
|
+
attr_reader :connected
|
16
|
+
|
17
|
+
## Returns the current connection status of the wordnet object.
|
18
|
+
#
|
19
|
+
# @return [true, false] The current connection status of the wordnet object.
|
20
|
+
alias :connected? connected
|
21
|
+
|
22
|
+
# Returns the type of the current wordnet connection.
|
23
|
+
#
|
24
|
+
# @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
|
25
|
+
attr_reader :connection_type
|
26
|
+
|
27
|
+
# Returns the data path currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
|
28
|
+
#
|
29
|
+
# @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
|
30
|
+
attr_reader :data_path
|
31
|
+
|
32
|
+
# Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
|
33
|
+
#
|
34
|
+
# @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
|
35
|
+
attr_reader :wordnet_path
|
36
|
+
|
37
|
+
# Constructs a new tokyo ruby connector for use with the words wordnet class.
|
38
|
+
#
|
39
|
+
# @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...)
|
40
|
+
# @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
|
41
|
+
# @return [TokyoWordnetConnection] A new wordnet connection.
|
42
|
+
# @raise [BadWordnetConnector] If the rufus-tokyo gem is not available.
|
11
43
|
def initialize(data_path, wordnet_path)
|
12
44
|
|
13
|
-
@data_path, @wordnet_path, @connection_type, @connected = data_path, wordnet_path, :tokyo, false
|
45
|
+
@data_path, @wordnet_path, @connection_type, @connected = data_path + 'wordnet.tct', wordnet_path, :tokyo, false
|
14
46
|
|
15
47
|
# ensure we have the rufus gem loaded, else there is little point in continuing...
|
16
48
|
raise BadWordnetConnector, "Coulden't find the rufus-tokyo gem. Please ensure it's installed." unless Gem.available?('rufus-tokyo')
|
@@ -19,20 +51,26 @@ module Words
|
|
19
51
|
|
20
52
|
end
|
21
53
|
|
54
|
+
# Causes the connection specified within the wordnet object to be reopened if currently closed.
|
55
|
+
#
|
56
|
+
# @raise [BadWordnetDataset] If the tokyo words dataset cannot be located at the data path.
|
22
57
|
def open!
|
23
58
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
59
|
+
unless connected?
|
60
|
+
if @data_path.exist?
|
61
|
+
@connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
|
62
|
+
@connected = true
|
63
|
+
else
|
64
|
+
@connected = false
|
65
|
+
raise BadWordnetDataset, "Failed to locate the tokyo words dataset at #{@data_path}. Please insure you have created it using the words gems provided 'build_wordnet' command."
|
66
|
+
end
|
31
67
|
end
|
32
68
|
return nil
|
33
69
|
|
34
70
|
end
|
35
71
|
|
72
|
+
# Causes the current connection to wordnet to be closed.
|
73
|
+
#
|
36
74
|
def close!
|
37
75
|
|
38
76
|
if connected?
|
@@ -43,15 +81,23 @@ module Words
|
|
43
81
|
|
44
82
|
end
|
45
83
|
|
46
|
-
#
|
47
|
-
|
84
|
+
# Locates from a term any relevant homographs and constructs a homographs hash.
|
85
|
+
#
|
86
|
+
# @param [String] term The specific term that is desired from within wordnet.
|
87
|
+
# @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
|
88
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
48
89
|
def homographs(term)
|
49
90
|
|
50
91
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
51
92
|
@connection[term]
|
52
93
|
|
53
94
|
end
|
54
|
-
|
95
|
+
|
96
|
+
# Locates from a synset_id a specific synset and constructs a synset hash.
|
97
|
+
#
|
98
|
+
# @param [String] synset_id The synset id to locate.
|
99
|
+
# @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
|
100
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
55
101
|
def synset(synset_id)
|
56
102
|
|
57
103
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
@@ -59,27 +105,37 @@ module Words
|
|
59
105
|
|
60
106
|
end
|
61
107
|
|
108
|
+
# Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
|
109
|
+
#
|
110
|
+
# @return [true, false] Whether evocations are currently available or not.
|
62
111
|
def evocations?
|
63
112
|
|
64
113
|
!evocations('n08112402').nil?
|
65
114
|
|
66
115
|
end
|
67
116
|
|
68
|
-
|
117
|
+
# Locates from a synset id any relevant evocations and constructs an evocations hash.
|
118
|
+
#
|
119
|
+
# @see Synset
|
120
|
+
# @param [String] synset_id The id number of a specific synset.
|
121
|
+
# @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
|
122
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
123
|
+
def evocations(synset_id)
|
69
124
|
|
70
125
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
71
|
-
@connection[
|
126
|
+
@connection[synset_id + "s"]
|
72
127
|
|
73
128
|
end
|
74
129
|
|
130
|
+
# Provides a textual description of the current connection state of the Wordnet object.
|
131
|
+
#
|
132
|
+
# @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in tokyo mode with dataset at /opt/wordnet"
|
75
133
|
def to_s
|
76
134
|
|
77
135
|
"Words running in tokyo mode with dataset at #{@dataset_path}"
|
78
136
|
|
79
137
|
end
|
80
138
|
|
81
|
-
alias connected? connected
|
82
|
-
|
83
139
|
end
|
84
140
|
|
85
141
|
end
|
data/lib/words.rb
CHANGED
@@ -1,49 +1,67 @@
|
|
1
|
-
#
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Standard library includes
|
2
4
|
require 'pathname'
|
3
5
|
|
4
|
-
#
|
6
|
+
# Local includes
|
5
7
|
require File.join(File.dirname(__FILE__),'homographs.rb')
|
6
8
|
|
9
|
+
# The Words gem namespace. Within this we offer a number of classes to facilitate useful interaction with words and language. Currently this largely consists of Words::Wordnet which offers simple wordnet access.
|
7
10
|
module Words
|
8
11
|
|
9
|
-
#
|
12
|
+
# We identify each wordnet connector installed and their paths
|
10
13
|
SUPPORTED_CONNECTIORS = Dir[File.join(File.dirname(__FILE__),'wordnet_connectors','*_wordnet_connection.rb')].inject(Hash.new) { |connectors, connection_file| connectors[ File.basename(connection_file).split('_').first.to_sym ] = connection_file; connectors }
|
14
|
+
# An array of typical wordnet install locations (if you have a standard install somewhere else please open an issue on GitHub so we can improve!)
|
11
15
|
DEFAULT_WORDNET_LOCATIONS = ['/usr/share/wordnet', '/usr/local/share/wordnet', '/usr/local/WordNet-3.0', '/opt/WordNet-3.0', '/opt/wordnet', '/opt/local/share/WordNet-3.0/']
|
12
16
|
|
13
|
-
#
|
17
|
+
# Exception to indicate that the wordnet connector specified is not currently available/supported.
|
14
18
|
class BadWordnetConnector < RuntimeError; end
|
19
|
+
# Exception to indicate that there is a problem connecting to a specified wordnet dataset.
|
15
20
|
class BadWordnetDataset < RuntimeError; end
|
21
|
+
# Exception to indicate that there is not currently a connection to wordnet and thus any request cannot be fulfilled.
|
16
22
|
class NoWordnetConnection < RuntimeError; end
|
17
23
|
|
18
|
-
#
|
24
|
+
# The wordnet class provides a common interface for interaction with the wordnet dataset of your choice. It creates a connection, based on specified parameters, to a wordnet dataset and provides
|
25
|
+
# the means to interrogate that dataset. In addition it provides control and information about that wordnet connection.
|
19
26
|
class Wordnet
|
20
27
|
|
28
|
+
## Returns the underlying wordnet connection object.
|
29
|
+
#
|
30
|
+
# @return [PureWordnetConnection, TokyoWordnetConnection] the underlying wordnet connection object.
|
21
31
|
attr_reader :wordnet_connection
|
22
|
-
|
32
|
+
|
33
|
+
# Constructs a new wordnet connection object.
|
34
|
+
#
|
35
|
+
# @param [Symbol] connector_type Specifies the connector type or mode desired. Current supported connectors are :pure and :tokyo.
|
36
|
+
# @param [String, Symbol] wordnet_path Specifies the directory within which the wordnet dictionary can be found. It can be set to :search to attempt to locate wordnet automatically.
|
37
|
+
# @param [String, Symbol] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...) It can be set to :default to use the standard location inside the gem directory.
|
38
|
+
# @return [Wordnet] The wordnet connection object.
|
39
|
+
# @raise [BadWordnetConnector] If an invalid connector type is provided.
|
23
40
|
def initialize(connector_type = :pure, wordnet_path = :search, data_path = :default)
|
24
41
|
|
25
|
-
#
|
42
|
+
# Check and specify useful paths
|
26
43
|
wordnet_path = Wordnet::locate_wordnet(wordnet_path)
|
27
44
|
data_path = (data_path == :default ? Pathname.new(File.join(File.dirname(__FILE__), '..', 'data')) : Pathname.new( data_path ))
|
28
45
|
|
29
|
-
#
|
46
|
+
# Ensure we have a valid connector type
|
30
47
|
raise BadWordnetConnector, "You specified an unsupported wordnet connector type. Supported connectors are: #{SUPPORTED_CONNECTIORS}" unless SUPPORTED_CONNECTIORS.include? connector_type
|
31
48
|
|
32
|
-
#
|
33
|
-
|
49
|
+
# We can assume that the desired connector is now available
|
50
|
+
desired_connector = SUPPORTED_CONNECTIORS[connector_type]
|
51
|
+
|
52
|
+
# Assuming we have a valid connection type we can import the relevant code (the reason we do this dynamically is to reduce loadtime)
|
53
|
+
require desired_connector
|
34
54
|
|
35
|
-
#
|
36
|
-
@wordnet_connection = Words.const_get( File.basename(
|
55
|
+
# Construct the connector object
|
56
|
+
@wordnet_connection = Words.const_get( File.basename(desired_connector, '.rb').gsub(/(^|_)(.)/) { $2.upcase } ).new(data_path, wordnet_path)
|
37
57
|
|
38
|
-
# construct some conveniance menthods for relation type access
|
39
|
-
[:connection_type, :wordnet_path, :data_path, :close!, :open!, :connected?, :evocations?].each do |method_name|
|
40
|
-
self.class.send(:define_method, method_name) do
|
41
|
-
@wordnet_connection.send method_name if defined? @wordnet_connection
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
58
|
end
|
46
59
|
|
60
|
+
# Locates the set of homographs within wordnet specific to the term entered.
|
61
|
+
#
|
62
|
+
# @param [String] term The specific term that is desired from within wordnet. This is case insensitive & we do a small amount of cleanup.
|
63
|
+
# @return [Homographs] An object encapsulating the homographs of the desired term. If the term cannot be located within wordnet then nil is returned.
|
64
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
47
65
|
def find(term)
|
48
66
|
|
49
67
|
raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
|
@@ -51,23 +69,91 @@ module Words
|
|
51
69
|
Homographs.new(homographs, @wordnet_connection) unless homographs.nil?
|
52
70
|
|
53
71
|
end
|
54
|
-
|
72
|
+
|
73
|
+
# Returns the type of the current wordnet connection.
|
74
|
+
#
|
75
|
+
# @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
|
76
|
+
def connection_type
|
77
|
+
|
78
|
+
@wordnet_connection.connection_type
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
# Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
|
83
|
+
#
|
84
|
+
# @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
|
85
|
+
def wordnet_path
|
86
|
+
|
87
|
+
@wordnet_connection.wordnet_path
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
# Returns the data path currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
|
92
|
+
#
|
93
|
+
# @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
|
94
|
+
def data_path
|
95
|
+
|
96
|
+
@wordnet_connection.data_path
|
97
|
+
|
98
|
+
end
|
99
|
+
|
100
|
+
# Causes the current connection to wordnet to be closed.
|
101
|
+
#
|
102
|
+
def close!
|
103
|
+
|
104
|
+
@wordnet_connection.close!
|
105
|
+
|
106
|
+
end
|
107
|
+
|
108
|
+
# Causes the connection specified within the wordnet object to be reopened if currently closed.
|
109
|
+
#
|
110
|
+
def open!
|
111
|
+
|
112
|
+
@wordnet_connection.open!
|
113
|
+
|
114
|
+
end
|
115
|
+
|
116
|
+
# Returns the current connection status of the wordnet object.
|
117
|
+
#
|
118
|
+
# @return [true, false] The current connection status of the wordnet object.
|
119
|
+
def connected?
|
120
|
+
|
121
|
+
@wordnet_connection.connected?
|
122
|
+
|
123
|
+
end
|
124
|
+
|
125
|
+
# Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
|
126
|
+
#
|
127
|
+
# @return [true, false] Whether evocations are currently available or not.
|
128
|
+
def evocations?
|
129
|
+
|
130
|
+
@wordnet_connection.evocations?
|
131
|
+
|
132
|
+
end
|
133
|
+
|
134
|
+
# Provides a textual description of the current connection state of the Wordnet object.
|
135
|
+
#
|
136
|
+
# @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
|
55
137
|
def to_s
|
56
138
|
|
57
|
-
#
|
139
|
+
# Return a description of the connector
|
58
140
|
!connected? ? "Words not connected" : @wordnet_connection.to_s
|
59
141
|
|
60
142
|
end
|
61
143
|
|
62
144
|
private
|
63
145
|
|
146
|
+
# Attempts to locate wordnet given an array of directories to look within
|
147
|
+
#
|
148
|
+
# @param [String, Array<String>, Symbol] base_dirs Either a path, an array of paths, or the :search symbol. Will attempt to locate wordnet within these specified directories.
|
149
|
+
# @return [Pathname, nil] The pathname of the wordnet dictionary files or nil if they can't be located within the passed directories
|
64
150
|
def self.locate_wordnet(base_dirs)
|
65
151
|
|
66
152
|
base_dirs = case base_dirs
|
67
153
|
when :search
|
68
154
|
DEFAULT_WORDNET_LOCATIONS
|
69
155
|
else
|
70
|
-
[ base_dirs ]
|
156
|
+
[ base_dirs ].flatten
|
71
157
|
end
|
72
158
|
|
73
159
|
base_dirs.each do |dir|
|