words-wordnet 0.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +20 -0
- data/README.markdown +169 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/bin/build_wordnet +177 -0
- data/examples.rb +55 -0
- data/lib/evocations.rb +81 -0
- data/lib/homographs.rb +100 -0
- data/lib/relation.rb +90 -0
- data/lib/synset.rb +201 -0
- data/lib/wordnet_connectors/pure_wordnet_connection.rb +224 -0
- data/lib/wordnet_connectors/tokyo_wordnet_connection.rb +141 -0
- data/lib/words.rb +172 -0
- data/spec/words_spec.rb +151 -0
- data/words.gemspec +57 -0
- metadata +95 -0
@@ -0,0 +1,224 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module Words
|
4
|
+
|
5
|
+
# Provides a pure ruby connector to the Wordnet dataset.
|
6
|
+
class PureWordnetConnection
|
7
|
+
|
8
|
+
# Converts a single-letter POS code to its multi-letter equivalent
|
9
|
+
SHORT_TO_POS_FILE_TYPE = { 'a' => 'adj', 'r' => 'adv', 'n' => 'noun', 'v' => 'verb' }
|
10
|
+
|
11
|
+
# Set of indexes for seeking directly into wordnet files to identify terms with significantly improved performance
|
12
|
+
INDEXES = {
|
13
|
+
:noun => {"mv"=>2908615, "fa"=>1455677, "g-"=>1695451, "hy"=>2196287, "ac"=>21116, "wr"=>4743086, "rt"=>3724403, "k_"=>2405676, "mw"=>2908680, "fb"=>1539515, "g."=>1695573, "hz"=>2219696, "ad"=>48269, "ws"=>4747643, "ru"=>3724431, "mx"=>2908742, "fc"=>1539583, "80"=>6057, "ae"=>63445, "wt"=>4747670, "rv"=>3740230, "ka"=>2405742, "l-"=>2459655, "my"=>2908771, "fd"=>1539637, "af"=>68288, "wu"=>4747756, "rw"=>3740258, "kb"=>2417524, "l."=>2459745, "fe"=>1539722, "ag"=>74279, "wv"=>4748078, "kc"=>2417632, "ah"=>83260, "ry"=>3740424, "pa"=>3143343, "36"=>5141, "ai"=>83677, "ww"=>4748110, "pb"=>3211047, "ke"=>2417664, "aj"=>91267, "v-"=>4545234, "pc"=>3211172, "fh"=>1559167, "ak"=>91562, "wy"=>4748137, "v."=>4545387, "ua"=>4496561, "pd"=>3211308, "kg"=>2427122, "fi"=>1559226, "al"=>92464, "ub"=>4496594, "pe"=>3211419, "2n"=>4947, "fj"=>1596225, "am"=>130827, "kh"=>2427183, "uc"=>4496797, "pf"=>3263095, "88"=>6083, "an"=>154839, "ki"=>2428739, "za"=>4773142, "ud"=>4496830, "fl"=>1596256, "ao"=>203539, "zb"=>4775763, "ph"=>3263286, "fm"=>1622351, "ap"=>204006, "uf"=>4496962, "pi"=>3293279, "fn"=>1622416, "aq"=>218174, "kk"=>2442519, "zd"=>4775847, "ug"=>4497019, "pj"=>3328895, "fo"=>1622444, "ar"=>219963, "kl"=>2442551, "ze"=>4775874, "uh"=>4497483, "pk"=>3328925, "fp"=>1650875, "as"=>262743, "km"=>2443913, "ui"=>4497543, "pl"=>3329011, "at"=>282628, "kn"=>2443973, "pm"=>3357376, "fr"=>1650935, "au"=>299805, "ko"=>2448754, "zh"=>4778739, "uk"=>4497767, "pn"=>3357459, "fs"=>1681993, "av"=>316371, "kp"=>2453337, "zi"=>4778934, "ul"=>4498102, "po"=>3358476, "ft"=>1682056, "aw"=>319552, "um"=>4501463, "fu"=>1682252, "ax"=>320182, "1-"=>1892, "kr"=>2453390, "un"=>4503199, "pp"=>3416671, "ay"=>321448, "ks"=>2455025, "zl"=>4782157, "fw"=>1695021, "az"=>322115, "kt"=>2455090, "d_"=>1083112, "up"=>4528358, "pr"=>3416755, "o'"=>3029255, "10"=>1959, "ku"=>2455116, "zn"=>4782189, "ps"=>3483993, "e-"=>1289529, "fy"=>1695051, "kv"=>2458073, "da"=>1083181, "zo"=>4782217, "ur"=>4532258, 
"pt"=>3492829, "i_"=>2220034, "11"=>2218, "kw"=>2458174, "db"=>1108193, "e."=>1289664, "us"=>4538820, "t'"=>4259996, "pu"=>3496345, "60"=>5843, "12"=>2315, "dc"=>1108287, "ut"=>4542211, "pv"=>3517927, "ia"=>2220399, "13"=>2445, "ky"=>2458844, "zr"=>4784927, "pw"=>3517990, "ib"=>2220863, "j."=>2341407, "14"=>2471, "dd"=>1108386, "zs"=>4784956, "uu"=>4544207, "px"=>3518017, "ic"=>2221692, "15"=>2558, "de"=>1108520, "uv"=>4544342, "py"=>3518043, "o."=>3029509, "na"=>2919040, "id"=>2226538, "16"=>2733, "df"=>1168182, "zu"=>4784989, "s_"=>3741387, "nb"=>2942448, "ie"=>2230327, "17"=>2788, "dg"=>1168212, "ux"=>4544722, "t-"=>4260104, "nc"=>2942542, "if"=>2230421, "18"=>3024, "dh"=>1168244, "zw"=>4785347, "uy"=>4544863, "t."=>4260425, "sa"=>3741419, "nd"=>2942608, "ig"=>2230448, "19"=>3319, "di"=>1168953, "x_"=>4749915, "uz"=>4544913, "sb"=>3800209, "ne"=>2942718, "dj"=>1223633, "zy"=>4785471, "y-"=>4755272, "sc"=>3800328, "dk"=>1223962, "xa"=>4749991, "sd"=>3836240, "ng"=>2975369, "ii"=>2232411, "dl"=>1224061, "se"=>3836272, "nh"=>2975761, "ij"=>2232906, "dm"=>1224120, "xc"=>4750937, "sf"=>3898201, "ni"=>2975793, "ik"=>2233046, "dn"=>1224525, "sg"=>3898276, "nj"=>2989622, "il"=>2233145, "do"=>1224823, "y2"=>4755339, "xe"=>4750963, "sh"=>3898399, "im"=>2236982, "dp"=>1251815, "si"=>3934020, "nl"=>2989719, "in"=>2250132, "sj"=>3974373, "nm"=>2989774, "io"=>2317192, "dr"=>1251968, "xh"=>4752829, "sk"=>3974412, "ip"=>2319242, "ds"=>1271920, "xi"=>4752879, "sl"=>3982232, "nn"=>2989842, "iq"=>2320204, "dt"=>1272024, "sm"=>3995291, "no"=>2989930, "ir"=>2320265, "du"=>1272082, "sn"=>4003308, "np"=>3016438, "is"=>2328830, "dv"=>1284206, "xl"=>4753577, "so"=>4011968, "it"=>2336645, "b_"=>324352, "dw"=>1284263, "xm"=>4753603, "sp"=>4051506, "nr"=>3016535, "iu"=>2338757, "0"=>1840, "4-"=>5374, "sq"=>4097051, "ns"=>3016775, "iv"=>2338786, "1"=>1865, "ba"=>324554, "c-"=>600455, "dy"=>1286409, "xo"=>4753634, "sr"=>4102220, "nt"=>3016984, "8_"=>6119, "iw"=>2340321, "2"=>4177, 
"c."=>600659, "dz"=>1289430, "g_"=>1695801, "nu"=>3017043, "ix"=>2340452, "9-"=>6205, "3"=>4985, "bb"=>390069, "40"=>5406, "ss"=>4102507, "nv"=>3026658, "iy"=>2341048, "4"=>5349, "3d"=>5205, "ga"=>1695861, "h-"=>2030546, "st"=>4102714, "nw"=>3026690, "iz"=>2341117, "9/"=>6238, "5"=>5594, "bd"=>390218, "gb"=>1726120, "h."=>2030576, "su"=>4180331, "90"=>6271, "6"=>5818, "be"=>390276, "c2"=>601143, "gc"=>1726268, "xt"=>4753701, "sv"=>4232564, "ny"=>3026772, "7"=>5946, "la"=>2459898, "m-"=>2643999, "gd"=>1726351, "sw"=>4232896, "q_"=>3524972, "8"=>6032, "lb"=>2507825, "m."=>2644096, "44"=>5495, "ge"=>1726452, "xv"=>4753754, "r-"=>3544131, "9"=>6180, "bh"=>428251, "lc"=>2507915, "h2"=>2030821, "sy"=>4247569, "r."=>3544158, "qa"=>3525003, "bi"=>428758, "ld"=>2507971, "v_"=>4545417, "sz"=>4259706, "bj"=>454188, "le"=>2508074, "m1"=>2644168, "xx"=>4753864, "qc"=>3525650, "bk"=>454250, "lf"=>2545647, "m2"=>2644194, "gh"=>1914825, "xy"=>4754258, "w."=>4622501, "va"=>4545477, "bl"=>454276, "lg"=>2545676, "m3"=>2644220, "gi"=>1915953, "qe"=>3525677, "bm"=>487643, "lh"=>2545732, "gj"=>1928001, "vc"=>4562340, "bn"=>487795, "li"=>2545866, "vd"=>4562367, "bo"=>487822, "lj"=>2588790, "gl"=>1928034, "ve"=>4562424, "bp"=>527090, "gm"=>1941192, "vf"=>4587559, "qi"=>3525733, "3r"=>5272, "gn"=>1941253, "br"=>527207, "ll"=>2588826, "go"=>1942339, "vh"=>4587589, "lm"=>2589254, "gp"=>1965489, "a'"=>6392, "bs"=>567010, "vi"=>4587630, "3t"=>5322, "bt"=>567093, "lo"=>2589280, "gr"=>1965634, "bu"=>567123, "lp"=>2623355, "gs"=>2010072, "bv"=>598604, "vl"=>4612572, "qo"=>3526131, "1_"=>3606, "bw"=>598664, "2-"=>4204, "lr"=>2623408, "gu"=>2010162, "8v"=>6153, "k'"=>2405456, "ls"=>2623434, "a-"=>6423, "by"=>598787, "vo"=>4613249, "lt"=>2623463, "e_"=>1290183, "6_"=>5911, "gw"=>2026208, "a."=>6630, "lu"=>2623552, "'h"=>1740, "20"=>4330, "ea"=>1290252, "gy"=>2026300, "21"=>4385, "vr"=>4621015, "lw"=>2635752, "eb"=>1300178, "f."=>1455392, "j_"=>2341888, "qu"=>3526162, "lx"=>2635783, "ec"=>1301281, 
"70"=>5971, "22"=>4411, "vt"=>4621044, "ly"=>2635907, "ed"=>1308417, "ja"=>2341922, "k-"=>2405491, "23"=>4474, "vu"=>4621076, "qw"=>3544030, "o_"=>3029601, "k."=>2405619, "24"=>4500, "p-"=>3142944, "ee"=>1316159, "25"=>4636, "p."=>3143064, "oa"=>3029664, "ef"=>1316593, "jd"=>2362188, "26"=>4662, "t_"=>4260563, "p/"=>3143308, "ob"=>3030924, "eg"=>1318289, "je"=>2362216, "27"=>4688, "vx"=>4622352, "u-"=>4495612, "oc"=>3037012, "eh"=>1321628, "jf"=>2371138, "k2"=>2405647, "28"=>4714, "vy"=>4622382, "u."=>4495708, "ta"=>4260664, "od"=>3042646, "ei"=>1321758, "29"=>4740, "y_"=>4755366, "tb"=>4295216, "oe"=>3044953, "ej"=>1323919, "'s"=>1771, "jh"=>2371165, "z-"=>4773112, "tc"=>4295357, "of"=>3046532, "ek"=>1324264, "78"=>5997, "ji"=>2371193, "ya"=>4755402, "td"=>4295640, "og"=>3049310, "el"=>1324361, "yb"=>4759174, "te"=>4295669, "oh"=>3049737, "em"=>1348056, "u3"=>4496533, "oi"=>3050182, "en"=>1357595, "oj"=>3052575, "eo"=>1377701, "ye"=>4759264, "th"=>4330947, "ok"=>3052696, "1s"=>3787, "ep"=>1378260, "ti"=>4366648, "ol"=>3053511, "jn"=>2373545, "eq"=>1387580, "yg"=>4767903, "tj"=>4385574, "om"=>3062383, "jo"=>2373601, "er"=>1391721, "yh"=>4767972, "tk"=>4385664, "on"=>3064512, "d'"=>1082835, "es"=>1401937, "yi"=>4768028, "tl"=>4385691, "et"=>1408856, "tm"=>4385787, "oo"=>3070387, "jr"=>2392018, "eu"=>1413487, "tn"=>4385843, "op"=>3071039, "a"=>6297, "ev"=>1427580, "yl"=>4768444, "to"=>4385934, "b"=>323845, "c_"=>601171, "ew"=>1432034, "ym"=>4768512, "tp"=>4413193, "or"=>3081061, "c"=>600316, "ju"=>2392073, "ex"=>1432298, "n'"=>2918885, "5-"=>5619, "os"=>3115959, "d"=>1082786, "jv"=>2405234, "ca"=>601439, "d-"=>1082871, "ey"=>1452457, "yo"=>4768542, "tr"=>4413220, "ot"=>3122137, "e"=>1289463, "cb"=>712079, "d."=>1082934, "ez"=>1454953, "yp"=>4771198, "ts"=>4466928, "ou"=>3124879, "f"=>1455328, "50"=>5765, "yq"=>4771250, "ov"=>3129739, "ha"=>2030856, "i-"=>2219776, "g"=>1695338, "jy"=>2405260, "cc"=>712135, "yr"=>4771279, "tt"=>4467892, "ow"=>3136728, "hb"=>2076148, 
"i."=>2219806, "h"=>2030472, "cd"=>712198, "tu"=>4467944, "ox"=>3137307, "hc"=>2076182, "i"=>2219725, "ce"=>712729, "n-"=>2918921, "yt"=>4771310, "tv"=>4484640, "oy"=>3141259, "hd"=>2076237, "j"=>2341367, "cf"=>737620, "ma"=>2644246, "yu"=>4771416, "tw"=>4485217, "r_"=>3544308, "oz"=>3142126, "n."=>2918965, "he"=>2076337, "k"=>2405363, "cg"=>737739, "mb"=>2737124, "yv"=>4773040, "tx"=>4490575, "s-"=>3741191, "hf"=>2121232, "l"=>2459527, "ch"=>737800, "mc"=>2737372, "ty"=>4490610, "s."=>3741222, "ra"=>3544339, "hg"=>2121297, "m"=>2643918, "ci"=>811192, "md"=>2738186, "tz"=>4495399, "s/"=>3741360, "rb"=>3580128, "n"=>2918808, "cj"=>827445, "me"=>2738337, "x-"=>4749199, "rc"=>3580216, "o"=>3029204, "hh"=>2121341, "mf"=>2788090, "wa"=>4622931, "p"=>3142904, "hi"=>2121371, "cl"=>827472, "mg"=>2788180, "wb"=>4654707, "re"=>3580247, "q"=>3524944, "cm"=>860967, "mh"=>2788224, "rf"=>3658425, "r"=>3544069, "cn"=>861094, "mi"=>2788281, "s"=>3741105, "rg"=>3658504, "co"=>861878, "hl"=>2139669, "we"=>4654819, "t"=>4259917, "rh"=>3658530, "cp"=>1012981, "mk"=>2830687, "hm"=>2139701, "u"=>4495561, "ri"=>3667785, "ml"=>2830716, "hn"=>2139877, "v"=>4545170, "cr"=>1013175, "ho"=>2139935, "wh"=>4672549, "w"=>4622437, "cs"=>1048516, "mm"=>2830804, "hp"=>2182075, "x"=>4749153, "wi"=>4692782, "ct"=>1048663, "mn"=>2830893, "4t"=>5531, "hq"=>2182104, "y"=>4755232, "cu"=>1049194, "mo"=>2831144, "hr"=>2182134, "z"=>4773075, "rn"=>3687863, "cv"=>1068811, "mp"=>2881103, "hs"=>2182299, "wl"=>4724359, "ro"=>3688004, "2_"=>4766, "cw"=>1068869, "4w"=>5558, "ht"=>2182563, "a_"=>7001, "wm"=>4724387, "rp"=>3724343, "l'"=>2459588, "3-"=>5010, "mr"=>2881244, "hu"=>2182649, "wn"=>4724445, "b-"=>323934, "cy"=>1068938, "ms"=>2881650, "wo"=>4724472, "b."=>324186, "cz"=>1082090, "mt"=>2881981, "f_"=>1455546, "hw"=>2196252, "aa"=>7256, "wp"=>4743059, "30"=>5078, "mu"=>2882421, ".2"=>1811, "ab"=>8002},
|
14
|
+
:adj => {"2d"=>4592, "31"=>4851, "fa"=>261714, ".3"=>1880, "hy"=>340391, "ac"=>12021, "32"=>4905, "wr"=>818088, ".4"=>2020, "ad"=>18614, "80"=>7432, "ae"=>23100, "ru"=>595594, "ka"=>388840, "l-"=>392296, "33"=>4959, "my"=>455103, "81"=>7486, "af"=>24303, "34"=>5013, "fe"=>267964, "82"=>7512, "ag"=>26180, "rw"=>598392, "35"=>5067, "83"=>7538, "ah"=>28531, "36"=>5121, "pa"=>505816, "84"=>7564, "ai"=>28659, "ke"=>389430, "37"=>5175, "85"=>7590, "aj"=>29502, "38"=>5229, "v-"=>789182, "86"=>7644, "ak"=>29530, "39"=>5283, "fi"=>270598, "wy"=>819172, "87"=>7670, "al"=>29597, "2n"=>4618, "ub"=>722231, "pe"=>515525, "am"=>36386, "kh"=>389925, "88"=>7696, "an"=>40420, "ki"=>389956, "za"=>822848, "fl"=>275697, "89"=>7722, "ao"=>51761, "ph"=>523560, "ap"=>51861, "pi"=>526560, "aq"=>56753, "ug"=>722267, "fo"=>280302, "ar"=>57023, "ze"=>823049, "as"=>62836, "pl"=>530025, "at"=>67162, "kn"=>390913, "fr"=>286656, "au"=>69848, "ko"=>391903, "uk"=>722361, "pn"=>534250, "7t"=>7346, "av"=>73727, "zi"=>823276, "ul"=>722396, "po"=>534415, "aw"=>74649, "um"=>723080, "fu"=>290894, "ax"=>75702, "un"=>723684, "az"=>76033, "up"=>785441, "6-"=>6566, "pr"=>541620, "10"=>2210, "ku"=>392129, "ps"=>556687, "da"=>189020, "zo"=>823490, "pt"=>558010, "11"=>2525, "ur"=>787459, "60"=>6600, "pu"=>558096, "12"=>2691, "us"=>788002, "61"=>6654, "ia"=>343317, "13"=>2857, "ky"=>392202, "ut"=>788423, "62"=>6680, "ib"=>343385, "14"=>3023, "63"=>6706, "ic"=>343452, "15"=>3189, "de"=>191816, "na"=>456102, "o."=>482752, "64"=>6732, "py"=>562611, "id"=>344057, "16"=>3355, "uv"=>788962, "65"=>6786, "17"=>3521, "if"=>345034, "18"=>3687, "t-"=>685912, "ux"=>789057, "66"=>6840, "ig"=>345064, "19"=>3797, "sa"=>598455, "di"=>206077, "67"=>6866, "dj"=>220263, "uz"=>789122, "ne"=>459116, "zy"=>823837, "y-"=>820785, "68"=>6892, "sc"=>604552, "69"=>6918, "xa"=>819260, "ii"=>345421, "se"=>608988, "xc"=>819292, "ni"=>464278, "il"=>345474, "do"=>220299, "y2"=>820817, "xe"=>819571, "im"=>347578, "sh"=>623055, "in"=>353335, 
"si"=>629245, "io"=>380365, "dr"=>225168, "ip"=>380637, "sk"=>635321, "5t"=>6514, "xi"=>819747, "sl"=>636220, "ir"=>380672, "sm"=>639638, "du"=>227825, "no"=>466328, "is"=>382721, "sn"=>641766, "xl"=>819882, "it"=>383968, "so"=>643211, "dw"=>229873, "0"=>2160, "sp"=>649763, "4-"=>5389, "iv"=>384300, "1"=>2185, "ba"=>76418, "sq"=>656684, "dy"=>229940, "2"=>3934, "sr"=>658125, "nt"=>481308, "ix"=>384390, "9-"=>7800, "3"=>4645, "40"=>5460, "nu"=>481335, "4"=>5364, "41"=>5570, "ga"=>294211, "h-"=>313795, "5"=>6083, "st"=>658188, "42"=>5624, "90"=>7834, "6"=>6541, "be"=>83696, "su"=>671222, "43"=>5678, "91"=>7888, "7"=>6971, "sv"=>681720, "la"=>392328, "ny"=>482674, "92"=>7914, "8"=>7373, "sw"=>681768, "44"=>5732, "ge"=>296692, "xv"=>820134, "93"=>7940, "9"=>7775, "bh"=>89324, "45"=>5786, "r."=>566629, "94"=>7966, "bi"=>89359, "sy"=>683436, "46"=>5840, "qa"=>563600, "95"=>7992, "le"=>398566, "47"=>5894, "xx"=>820244, "96"=>8046, "48"=>5948, "w-"=>800937, "gh"=>299675, "97"=>8072, "bl"=>96334, "49"=>6002, "va"=>789214, "gi"=>299913, "98"=>8098, "li"=>402864, "99"=>8124, "bo"=>102434, "gl"=>300821, "ve"=>791850, "3r"=>5337, "gn"=>303067, "br"=>107474, "go"=>303237, "vi"=>795294, "lo"=>409658, "gr"=>305879, "bu"=>114508, "8t"=>7748, "2-"=>3959, "gu"=>312309, "a-"=>8177, "by"=>118813, "vo"=>799000, "a."=>8266, "lu"=>416153, "7-"=>6996, "20"=>3996, "lv"=>417909, "ea"=>230712, "gy"=>313409, "21"=>4106, "eb"=>232434, "lx"=>418019, "ec"=>232594, "70"=>7030, "qu"=>563668, "22"=>4160, "ly"=>418802, "ed"=>233599, "71"=>7084, "ja"=>384470, "23"=>4214, "vu"=>800557, "72"=>7110, "24"=>4268, "ee"=>234244, "73"=>7136, "25"=>4322, "ef"=>234377, "oa"=>482782, "p."=>505788, "74"=>7162, "26"=>4376, "eg"=>235115, "ob"=>482870, "75"=>7188, "je"=>385496, "27"=>4430, "u-"=>722199, "oc"=>484785, "76"=>7242, "28"=>4484, "ei"=>235526, "od"=>485595, "29"=>4538, "ta"=>685944, "oe"=>486011, "77"=>7268, "of"=>486090, "78"=>7294, "ji"=>386292, "ya"=>820854, "el"=>236340, "79"=>7320, "em"=>238831, 
"oh"=>487955, "te"=>690141, "en"=>240855, "oi"=>487986, "eo"=>245834, "ye"=>820972, "ok"=>488175, "1s"=>3907, "th"=>695473, "ep"=>245983, "ol"=>488233, "ti"=>702295, "eq"=>247606, "om"=>489492, "jo"=>386595, "er"=>248382, "on"=>489888, "6t"=>6944, "es"=>249587, "yi"=>822204, "et"=>250446, "oo"=>493273, "jr"=>387299, "eu"=>251131, "op"=>493333, "ev"=>252105, "to"=>705018, "or"=>495731, "c"=>118928, "ju"=>387326, "ex"=>253536, "5-"=>6108, "os"=>498199, "d"=>188965, "ca"=>118953, "ey"=>261468, "yo"=>822254, "ot"=>498747, "tr"=>709204, "d."=>188990, "ou"=>499048, "ts"=>717450, "50"=>6142, "ov"=>501567, "ha"=>313827, "cc"=>131547, "51"=>6252, "ow"=>505326, "cd"=>131600, "52"=>6278, "i"=>343292, "ce"=>131626, "tu"=>717516, "n-"=>456074, "53"=>6304, "ox"=>505512, "ma"=>419357, "54"=>6330, "yu"=>822690, "he"=>320515, "k"=>388815, "tw"=>719084, "l"=>392271, "s-"=>598423, "ch"=>134647, "55"=>6356, "ra"=>566657, "m"=>419332, "ci"=>142370, "ty"=>721720, "56"=>6410, "tz"=>722141, "me"=>429139, "57"=>6436, "58"=>6462, "x-"=>819228, "hi"=>327145, "cl"=>144311, "59"=>6488, "wa"=>800969, "re"=>571244, "mi"=>436715, "co"=>150272, "rh"=>587466, "we"=>804320, "hm"=>331476, "ri"=>588287, "u"=>722172, "v"=>789157, "cr"=>176976, "ho"=>331505, "wh"=>809316, "x"=>819203, "ct"=>183185, "mn"=>442650, "4t"=>6056, "wi"=>811263, "cu"=>183216, "mo"=>442756, "9t"=>8150, "cv"=>186983, "ro"=>591099, "a_"=>8294, "cx"=>187009, "3-"=>4670, "hu"=>337473, "cy"=>187229, "cz"=>188827, "wo"=>814766, "30"=>4741, "mu"=>450942, ".2"=>1740, "8-"=>7398, "ab"=>8553},
|
15
|
+
:verb => {"ox"=>317944, "ep"=>169705, "ki"=>261865, "ne"=>302629, "x-"=>522318, "oy"=>318231, "ru"=>392475, "ur"=>502622, "bu"=>56124, "eq"=>169837, "oz"=>318263, "us"=>502895, "aa"=>1740, "er"=>170250, "ut"=>503162, "ab"=>1767, "es"=>170652, "ho"=>232836, "ac"=>3529, "et"=>171392, "ni"=>303882, "ta"=>465047, "ad"=>6490, "da"=>118966, "by"=>61875, "eu"=>171927, "ae"=>8718, "ev"=>172155, "kn"=>263317, "af"=>8929, "ko"=>264585, "ag"=>9437, "ex"=>173105, "te"=>472676, "wa"=>508666, "ga"=>204003, "ey"=>178534, "hu"=>237591, "ai"=>10286, "de"=>120649, "no"=>304585, "th"=>475571, "ti"=>478689, "we"=>512699, "za"=>523414, "ge"=>206371, "ja"=>255802, "hy"=>239280, "al"=>10951, "di"=>136972, "kv"=>264661, "am"=>12800, "wh"=>514913, "an"=>13706, "dj"=>148986, "ze"=>523472, "wi"=>517343, "je"=>256873, "ma"=>280541, "nu"=>305772, "ap"=>16409, "gh"=>210633, "to"=>481065, "aq"=>18267, "gi"=>210790, "ar"=>18347, "zi"=>523585, "as"=>19764, "do"=>149014, "pa"=>318325, "qu"=>356569, "tr"=>483983, "at"=>21915, "gl"=>213421, "ji"=>257383, "me"=>287710, "ts"=>490986, "wo"=>519567, "au"=>23027, "av"=>23935, "dr"=>152421, "gn"=>215270, "g."=>203975, "aw"=>24513, "go"=>215409, "tu"=>491013, "wr"=>520951, "ax"=>24704, "mi"=>290558, "pe"=>324550, "sa"=>395891, "zo"=>523781, "ca"=>61939, "du"=>158291, "tw"=>493797, "az"=>24779, "gr"=>218976, "sc"=>398961, "dw"=>159135, "jo"=>257765, "ph"=>328230, "pi"=>328891, "se"=>403889, "va"=>503309, "ty"=>494840, "fa"=>178632, "dy"=>159383, "gu"=>222269, "ce"=>72643, "mo"=>295025, "pl"=>332016, "sh"=>410924, "si"=>417650, "ve"=>504664, "ya"=>522393, "ch"=>73529, "fe"=>183011, "gy"=>223294, "ju"=>258735, "ci"=>81247, "po"=>336199, "sk"=>421199, "ic"=>240465, "sl"=>422915, "id"=>240617, "sm"=>427144, "vi"=>506088, "ye"=>522895, "cl"=>82562, "la"=>264691, "mu"=>299509, "sn"=>428534, "fi"=>185556, "pr"=>341164, "so"=>430811, "ig"=>240935, "ps"=>350290, "sp"=>433886, "co"=>87668, "pt"=>350435, "sq"=>440723, "yi"=>523082, "fl"=>189489, "le"=>268258, 
"my"=>301203, "pu"=>350501, "ob"=>306433, "vo"=>507446, "cr"=>110051, "oc"=>307497, "fo"=>194383, "od"=>307842, "st"=>442042, "il"=>241065, "li"=>271717, "ra"=>358635, "py"=>356507, "su"=>454237, "vr"=>508439, "cu"=>115910, "im"=>241391, "of"=>307921, "yo"=>523290, "ba"=>24810, "fr"=>199363, "in"=>244059, "og"=>308488, "sw"=>460901, "io"=>254520, "vu"=>508468, "ft"=>202432, "oi"=>308518, "re"=>363091, "sy"=>463729, "ea"=>159598, "cy"=>118743, "fu"=>202461, "ir"=>254766, "ok"=>308584, "be"=>30231, "eb"=>160312, "lo"=>275662, "rh"=>386786, "ec"=>160555, "is"=>255120, "om"=>308616, "ri"=>386952, "ed"=>160739, "it"=>255504, "on"=>308689, "ha"=>223396, "ug"=>495123, "bi"=>37946, "o."=>306373, "ef"=>161223, "oo"=>308723, "xe"=>522360, "eg"=>161607, "op"=>308863, "bl"=>39622, "he"=>228710, "ka"=>259787, "lu"=>279432, "or"=>309814, "ro"=>389309, "ej"=>161771, "os"=>310691, "ul"=>495155, "bo"=>44472, "ek"=>161875, "um"=>495274, "el"=>161933, "ke"=>259848, "na"=>301427, "ly"=>280367, "ou"=>310976, "un"=>495308, "em"=>162861, "hi"=>231023, "ov"=>313178, "e-"=>159560, "br"=>48932, "en"=>164868, "ow"=>317836, "up"=>501838},
|
16
|
+
:adv => {"ul"=>146918, "sa"=>121999, "me"=>87949, "is"=>79726, "al"=>6456, "fu"=>54854, "ty"=>146844, "ro"=>121173, "op"=>100918, "it"=>79763, "am"=>8151, "ba"=>15505, "gi"=>56230, "va"=>156823, "un"=>147057, "sc"=>122613, "pe"=>105172, "an"=>8719, "cy"=>31883, "or"=>101364, "ea"=>40812, "i."=>64191, "up"=>155754, "se"=>123441, "os"=>101738, "mi"=>89536, "lu"=>85885, "eb"=>41346, "ap"=>10042, "bc"=>16615, "gl"=>56390, "ph"=>107333, "ot"=>101845, "ha"=>58556, "do"=>38706, "ec"=>41381, "ve"=>157238, "ur"=>156560, "pi"=>107876, "ou"=>101917, "o."=>96668, "ar"=>10688, "be"=>16664, "ed"=>41593, "us"=>156624, "sh"=>125358, "ov"=>102923, "go"=>56887, "as"=>11428, "ye"=>162287, "ut"=>156758, "si"=>126337, "ru"=>121807, "ow"=>103605, "ly"=>86271, "dr"=>39948, "ee"=>41743, "at"=>12932, "pl"=>108832, "na"=>92504, "he"=>60006, "ef"=>41771, "au"=>14641, "jo"=>80066, "vi"=>157846, "sk"=>127582, "bi"=>18176, "gr"=>57195, "eg"=>41998, "a."=>1802, "av"=>14980, "sl"=>127918, "pn"=>109745, "mo"=>90407, "ke"=>80998, "du"=>40494, "aw"=>15087, "yi"=>162508, "sm"=>128914, "po"=>109782, "ei"=>42035, "ax"=>15383, "ux"=>156789, "ta"=>138198, "sn"=>129359, "ne"=>92966, "gu"=>58424, "bl"=>18795, "so"=>129638, "ca"=>22029, "hi"=>61819, "wa"=>159164, "sp"=>132158, "dy"=>40699, "el"=>42063, "ju"=>80410, "ki"=>81028, "vo"=>158698, "sq"=>133295, "pr"=>111025, "em"=>42369, "fa"=>47339, "bo"=>19295, "te"=>138977, "ps"=>114778, "ni"=>94052, "mu"=>91756, "en"=>42630, "yo"=>162542, "e'"=>40734, "we"=>159590, "pu"=>114861, "p."=>103637, "ib"=>64217, "ep"=>43786, "br"=>19838, "ce"=>23083, "th"=>140092, "st"=>133477, "ic"=>64297, "eq"=>43822, "fe"=>48826, "cf"=>23412, "la"=>81356, "ho"=>62489, "ze"=>162691, "ti"=>142646, "su"=>135818, "my"=>92434, "id"=>64326, "er"=>43965, "kn"=>81226, "wh"=>160191, "vu"=>159098, "es"=>44216, "bu"=>20620, "ch"=>23461, "ie"=>64701, "wi"=>160770, "sw"=>137556, "py"=>115529, "ob"=>96696, "et"=>44370, "a_"=>1884, "b."=>15451, "ci"=>24468, "if"=>64725, "ra"=>116360, 
"oc"=>97499, "no"=>94335, "le"=>82562, "ig"=>64753, "eu"=>44719, "zi"=>162788, "sy"=>137715, "od"=>97535, "fi"=>49357, "e."=>40786, "ev"=>44758, "by"=>20853, "cl"=>24639, "hu"=>63384, "to"=>143107, "of"=>97605, "da"=>31998, "ex"=>45689, "ab"=>2343, "re"=>117290, "li"=>83328, "ac"=>3468, "fl"=>50352, "wo"=>161523, "ga"=>55429, "co"=>25439, "ad"=>4138, "tr"=>145364, "nu"=>96480, "hy"=>63857, "il"=>64855, "ae"=>5138, "rh"=>120391, "im"=>65172, "af"=>5207, "ja"=>79789, "fo"=>50969, "wr"=>162050, "ri"=>120500, "qu"=>115566, "ok"=>98535, "cr"=>30383, "de"=>32710, "in"=>67015, "ag"=>5747, "ma"=>86304, "'t"=>1740, "ah"=>6150, "ge"=>55713, "tu"=>146503, "om"=>98587, "ip"=>78991, "ai"=>6367, "fr"=>53467, "pa"=>103691, "on"=>98620, "lo"=>84939, "je"=>79886, "cu"=>31304, "tw"=>146716, "o'"=>96613, "di"=>35621, "ir"=>79023, "ak"=>6428, "c."=>21975}
|
17
|
+
}
|
18
|
+
|
19
|
+
# Hash object used for caching retrieved terms to further improve retrieval performance
|
20
|
+
WORDS_CACHE = Hash.new
|
21
|
+
|
22
|
+
## Returns the current connection status of the wordnet object.
|
23
|
+
#
|
24
|
+
# @return [true, false] The current connection status of the wordnet object.
|
25
|
+
attr_reader :connected
|
26
|
+
|
27
|
+
## Returns the current connection status of the wordnet object.
|
28
|
+
#
|
29
|
+
# @return [true, false] The current connection status of the wordnet object.
|
30
|
+
alias :connected? connected
|
31
|
+
|
32
|
+
# Returns the type of the current wordnet connection.
|
33
|
+
#
|
34
|
+
# @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
|
35
|
+
attr_reader :connection_type
|
36
|
+
|
37
|
+
# Returns the datapath currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
|
38
|
+
#
|
39
|
+
# @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
|
40
|
+
attr_reader :data_path
|
41
|
+
|
42
|
+
# Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
|
43
|
+
#
|
44
|
+
# @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
|
45
|
+
attr_reader :wordnet_path
|
46
|
+
|
47
|
+
# Constructs a new pure ruby connector for use with the words wordnet class.
|
48
|
+
#
|
49
|
+
# @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (evocations etc...)
|
50
|
+
# @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
|
51
|
+
# @return [PureWordnetConnection] A new wordnet connection.
|
52
|
+
# @raise [BadWordnetDataset] If no wordnet path is provided (wordnet_path is nil).
|
53
|
+
def initialize(data_path, wordnet_path)
  # Remember where the datasets live; a pure connection always starts out closed.
  @data_path = data_path
  @wordnet_path = wordnet_path
  @connection_type = :pure
  @connected = false

  # Connect straight away so the new object is immediately usable.
  open!
end
|
60
|
+
|
61
|
+
# Causes the connection specified within the wordnet object to be reopened if currently closed.
|
62
|
+
#
|
63
|
+
# @raise [BadWordnetDataset] If the wordnet path is nil, i.e. the wordnet database could not be located.
|
64
|
+
def open!
  raise BadWordnetDataset, "Failed to locate the wordnet database. Please ensure it is installed and that if it resides at a custom path that path is given as an argument when constructing the Words object." if @wordnet_path.nil?

  @connected = true

  # Evocations are optional. The data path may legitimately be nil for the pure
  # connector, in which case there is nothing to look for.
  unless @data_path.nil?
    evocation_path = @data_path + 'evocations.dmp'
    if evocation_path.exist?
      # The dump is Marshal (binary) data, so read it in binary mode to avoid
      # newline/EOF translation on platforms that distinguish text from binary.
      # NOTE(review): Marshal.load can instantiate arbitrary objects; this file
      # must only ever be a trusted, locally built dataset.
      File.open(evocation_path, 'rb') do |file|
        @evocations = Marshal.load(file.read)
      end
    end
  end

  nil
end
|
78
|
+
|
79
|
+
# Causes the current connection to wordnet to be closed.
|
80
|
+
#
|
81
|
+
def close!
  # Closing only flips the flag; the pure connector holds no open handles.
  @connected = false
  nil
end
|
87
|
+
|
88
|
+
# Locates from a term any relevant homographs and constructs a homographs hash.
|
89
|
+
#
|
90
|
+
# @param [String] term The specific term that is desired from within wordnet.
|
91
|
+
# @param [true, false] use_cache Specify whether to use caching when finding and retrieving terms.
|
92
|
+
# @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
|
93
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
94
|
+
def homographs(term, use_cache = true)
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?

  # Cache entries are keyed by the normalised lemma (underscores, lower case).
  # Normalise once here so the lookup below uses the same key the cache was
  # filled with; otherwise a term such as "Ice Cream" would never be found.
  term = term.gsub(" ", "_").downcase

  # Make sure the term is in the cache, locating and adding it if possible.
  cache_ensure_from_wordnet(term, use_cache)

  # The word is now either cached or unknown. Convert the cached entry into
  # the homograph hash format (nil when the term was not found).
  cached_entry_to_homograph_hash(term)
end
|
105
|
+
|
106
|
+
# Locates from a synset_id a specific synset and constructs a synset hash.
|
107
|
+
#
|
108
|
+
# @param [String] synset_id The synset id to locate.
|
109
|
+
# @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
|
110
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
111
|
+
def synset(synset_id)
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?

  # The first character of the id selects the data file; the remainder is the
  # byte offset of the synset's line within that file.
  file_type = SHORT_TO_POS_FILE_TYPE[synset_id[0, 1]]
  return nil if file_type.nil? # unknown part of speech: no synset available

  File.open(@wordnet_path + "data.#{file_type}", "r") do |file|
    file.seek(synset_id[1..-1].to_i)
    raw_line = file.gets
    # An offset at or past the end of the file (or an empty line) holds no synset.
    next nil if raw_line.nil? || raw_line.strip.empty?

    # Split on the FIRST " | " only, so a gloss that itself contains " | " is kept whole.
    data_line, gloss = raw_line.strip.split(" | ", 2)

    # Drop the leading offset field, then peel off the fixed header fields.
    lexical_filenum, synset_type, word_count, *data_parts = data_line.split(" ")[1..-1]

    # The word count is read as hexadecimal; each word consumes two fields.
    words = Array.new(word_count.to_i(16)) { "#{data_parts.shift}.#{data_parts.shift}" }

    # The relation count is read as decimal; each relation consumes four fields.
    relations = Array.new(data_parts.shift.to_i) do
      "#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}.#{data_parts.shift}"
    end

    {
      "synset_id" => synset_id,
      "lexical_filenum" => lexical_filenum,
      "synset_type" => synset_type,
      "words" => words.join('|'),
      "relations" => relations.join('|'),
      "gloss" => gloss.to_s.strip
    }
  end
end
|
126
|
+
|
127
|
+
# Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
|
128
|
+
#
|
129
|
+
# @return [true, false] Whether evocations are currently available or not.
|
130
|
+
def evocations?
  # Probe a fixed synset id; any non-nil answer means evocation data is present.
  probe = evocations('n08112402')
  !probe.nil?
end
|
135
|
+
|
136
|
+
# Locates from a synset id any relevant evocations and constructs an evocations hash.
|
137
|
+
#
|
138
|
+
# @see Synset
|
139
|
+
# @param [String] synset_id The id number of a specific synset.
|
140
|
+
# @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
|
141
|
+
# @raise [NoWordnetConnection] If there is currently no wordnet connection.
|
142
|
+
def evocations(synset_id)
  raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?

  # No evocations dataset has been loaded into this connection.
  return nil unless defined?(@evocations)

  # Entries are stored under the synset id with an "s" suffix.
  entry = @evocations[synset_id + "s"]
  return nil if entry.nil?

  { 'relations' => entry[0], 'means' => entry[1], 'medians' => entry[2] }
end
|
154
|
+
|
155
|
+
# Provides a textual description of the current connection state of the Wordnet object.
|
156
|
+
#
|
157
|
+
# @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not Connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
|
158
|
+
def to_s
  # Describe the connection by the location of the wordnet files it reads.
  location = wordnet_path
  "Words running in pure mode using wordnet files found at #{location}"
end
|
163
|
+
|
164
|
+
private
|
165
|
+
|
166
|
+
# Ensures the given term is present in WORDS_CACHE when the index files contain it.
#
# @param [String] term The term to locate (spaces and case are normalised here).
# @param [true, false] use_cache Whether an existing cache entry may be reused.
# @return [nil]
def cache_ensure_from_wordnet(term, use_cache)
  # Normalise the term to the form used for lookups (underscores, lower case).
  term = term.gsub(" ", "_").downcase

  # If we already have the term and are happy to use the cache, there is nothing to do.
  return nil if use_cache && WORDS_CACHE.include?(term)

  # The seek tables are keyed by the first (up to) two characters of a lemma.
  term_initials = term[0, 2]

  INDEXES.each do |index_pos, offsets|
    next unless offsets.include?(term_initials) # this index cannot contain the term

    # The block form guarantees the index file is closed even if parsing raises.
    File.open(@wordnet_path + "index.#{index_pos}") do |file|
      file.seek(offsets[term_initials]) # jump to the first line of the term's bucket

      while (line = file.gets)
        # Compare against the initials of the line's LEMMA rather than its first
        # two raw characters: for a one-letter lemma the second raw character is
        # the field separator, which would make single-letter terms unfindable.
        break unless term_initials == line[/\A\S*/][0, 2]
        break if construct_cache_item(line, term, use_cache, index_pos)
      end
    end
  end

  nil
end
|
188
|
+
|
189
|
+
# Parses one line of a wordnet index file and, when wanted, records it in WORDS_CACHE.
#
# A cache entry is a flat array: [ lemma, pos, tagsense_count, synset_offsets, pos, tagsense_count, synset_offsets, ... ]
# i.e. the lemma followed by one (pos, count, offsets) triple per part of speech.
#
# @param [String] line A raw line read from an index file.
# @param [String] term The (already normalised) term being searched for.
# @param [true, false] use_cache When true every line is cached, not only the line for +term+.
# @param [Symbol] index_pos The key of the index being scanned (not stored in the cache entry).
# @return [true, false] true when the line belongs to +term+, so the caller can stop scanning this index.
def construct_cache_item(line, term, use_cache, index_pos)
  lemma, pos, *index_parts = line.split(' ') # split the line and split off the lemma

  # only save the word if it's the term we are after or we are caching everything we read
  return false unless lemma == term || use_cache

  entry = (WORDS_CACHE[lemma] ||= [ lemma ]) # ensure that there is a datastructure to hold our word information

  # Look only at the pos slot of each stored triple. Comparing against index_pos (a symbol
  # such as :noun) never matched the stored one-letter pos, so rescanning an index appended
  # duplicate triples; a plain include?(pos) would in turn confuse the lemma "n" with the pos "n".
  already_cached = entry.drop(1).each_slice(3).any? { |cached_pos, _count, _offsets| cached_pos == pos }

  unless already_cached
    # index_parts[1] is the pointer-symbol count; skip the counts and pointer symbols to reach
    # the tagsense count and the synset offsets that follow it
    tagsense_count, *synset_offsets = index_parts.slice(index_parts[1].to_i + 3..-1)
    WORDS_CACHE[lemma] += [ pos, tagsense_count.to_i, synset_offsets ] # add the tagsense_count and the synsets for the pos
  end

  lemma == term # if this was our word we can jump out and check the next index
end
|
203
|
+
|
204
|
+
# Converts the cached entry for +term+ into the homograph hash handed back to callers.
#
# @param [String] term The (already normalised) term whose cache entry should be formatted.
# @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... },
#   or nil when nothing beyond the lemma (or nothing at all) is cached for the term.
def cached_entry_to_homograph_hash(term)
  lemma, *raw_homographs = WORDS_CACHE[term] # peel the lemma off the front of the entry
  return nil if raw_homographs.empty? # we return nil if we haven't found the term

  tagsense_counts = []
  synset_ids = []

  # the remainder of the entry is a run of (pos, tagsense count, synset offsets) triples
  raw_homographs.each_slice(3) do |pos, tagsense_count, sense_offsets|
    tagsense_counts << "#{pos}#{tagsense_count}"
    sense_offsets.each { |sense_offset| synset_ids << "#{pos}#{sense_offset}" }
  end

  { 'lemma' => lemma, 'tagsense_counts' => tagsense_counts.join('|'), 'synset_ids' => synset_ids.join('|') }
end
|
221
|
+
|
222
|
+
end
|
223
|
+
|
224
|
+
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# gem includes
|
4
|
+
require 'rubygems'
|
5
|
+
# rufus-tokyo is an optional dependency. Gem.available? is no longer provided by current
# RubyGems, so simply attempt the require and tolerate the gem being absent.
begin
  require 'rufus-tokyo'
rescue LoadError
  # Not installed: nothing to load here. Constructing a TokyoWordnetConnection is where
  # the missing gem is expected to be reported to the caller.
end
|
6
|
+
|
7
|
+
module Words
|
8
|
+
|
9
|
+
# Provides a Tokyo Cabinet connector to the Wordnet dataset.
|
10
|
+
class TokyoWordnetConnection

  # Returns the current connection status of the wordnet object.
  #
  # @return [true, false] The current connection status of the wordnet object.
  attr_reader :connected

  # Returns the current connection status of the wordnet object.
  #
  # @return [true, false] The current connection status of the wordnet object.
  alias :connected? connected

  # Returns the type of the current wordnet connection.
  #
  # @return [Symbol] The current wordnet connection type. Always :tokyo for this connector.
  attr_reader :connection_type

  # Returns the path of the tokyo dataset file in use (the data directory passed to the constructor plus 'wordnet.tct').
  #
  # @return [Pathname] The path to the tokyo dataset file currently in use.
  attr_reader :data_path

  # Returns the path to the wordnet collection that was passed to the constructor (this may be irrelevant when using the tokyo connector and thus could be nil.)
  #
  # @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
  attr_reader :wordnet_path

  # Constructs a new tokyo connector for use with the words wordnet class and opens it.
  #
  # @param [Pathname] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...)
  # @param [Pathname] wordnet_path Specifies the directory within which the wordnet dictionary can be found.
  # @return [TokyoWordnetConnection] A new wordnet connection.
  # @raise [BadWordnetConnector] If the rufus-tokyo gem cannot be loaded.
  # @raise [BadWordnetDataset] If the tokyo dataset file cannot be found (raised by open!).
  def initialize(data_path, wordnet_path)

    @data_path, @wordnet_path, @connection_type, @connected = data_path + 'wordnet.tct', wordnet_path, :tokyo, false

    # ensure we have the rufus gem loaded, else there is little point in continuing...
    # (requiring here replaces Gem.available?, which current RubyGems no longer provides)
    begin
      require 'rufus-tokyo'
    rescue LoadError
      raise BadWordnetConnector, "Coulden't find the rufus-tokyo gem. Please ensure it's installed."
    end

    open!

  end

  # Causes the connection specified within the wordnet object to be reopened if currently closed.
  #
  # @return [nil]
  # @raise [BadWordnetDataset] If the tokyo dataset file does not exist at data_path.
  def open!

    unless connected?
      if @data_path.exist?
        @connection = Rufus::Tokyo::Table.new(@data_path.to_s, :mode => 'r')
        @connected = true
      else
        @connected = false
        raise BadWordnetDataset, "Failed to locate the tokyo words dataset at #{@data_path}. Please insure you have created it using the words gems provided 'build_wordnet' command."
      end
    end
    return nil

  end

  # Causes the current connection to wordnet to be closed. Does nothing when already closed.
  #
  # @return [nil]
  def close!

    if connected?
      @connection.close
      @connected = false
    end
    return nil

  end

  # Looks up the homographs record stored under the given term.
  #
  # @param [String] term The specific term that is desired from within wordnet.
  # @return [Hash, nil] A hash in the format { 'lemma' => ..., 'tagsense_counts' => ..., 'synset_ids' => ... }, or nil if no homographs are available.
  # @raise [NoWordnetConnection] If there is currently no wordnet connection.
  def homographs(term)

    raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
    @connection[term]

  end

  # Looks up the synset record stored under the given synset id.
  #
  # @param [String] synset_id The synset id to locate.
  # @return [Hash, nil] A hash in the format { "synset_id" => ..., "lexical_filenum" => ..., "synset_type" => ..., "words" => ..., "relations" => ..., "gloss" => ... }, or nil if no synset is available.
  # @raise [NoWordnetConnection] If there is currently no wordnet connection.
  def synset(synset_id)

    raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
    @connection[synset_id]

  end

  # Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
  # This is decided by probing for the evocations record of one fixed synset id.
  #
  # @return [true, false] Whether evocations are currently available or not.
  # @raise [NoWordnetConnection] If there is currently no wordnet connection.
  def evocations?

    !evocations('n08112402').nil?

  end

  # Looks up the evocations record for a synset id; evocations are stored under the synset id with an "s" suffix.
  #
  # @see Synset
  # @param [String] synset_id The id number of a specific synset.
  # @return [Hash, nil] A hash in the format { 'relations' => ..., 'means' => ..., 'medians' => ... }, or nil if no evocations are available.
  # @raise [NoWordnetConnection] If there is currently no wordnet connection.
  def evocations(synset_id)

    raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
    @connection[synset_id + "s"]

  end

  # Provides a textual description of the current connection state of the Wordnet object.
  #
  # @return [String] A textual description of the connection, e.g. "Words running in tokyo mode with dataset at /opt/wordnet/wordnet.tct"
  def to_s

    # @data_path is the variable set by the constructor; @dataset_path was never assigned and always rendered empty.
    "Words running in tokyo mode with dataset at #{@data_path}"

  end

end
|
140
|
+
|
141
|
+
end
|
data/lib/words.rb
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Standard library includes
|
4
|
+
require 'pathname'
|
5
|
+
|
6
|
+
# Local includes
|
7
|
+
require File.join(File.dirname(__FILE__),'homographs.rb')
|
8
|
+
|
9
|
+
# The Words gem namespace. Within this we offer a number of classes to facilitate useful interaction with words and language. Currently this largely consists of Words::Wordnet, which offers simple wordnet access.
|
10
|
+
module Words
|
11
|
+
|
12
|
+
# We identify each installed wordnet connector and its path
|
13
|
+
# Maps a connector name (the file name up to its first underscore, as a symbol) to the
# path of the connector file found in the wordnet_connectors directory beside this file.
SUPPORTED_CONNECTIORS = Hash[
  Dir[File.join(File.dirname(__FILE__), 'wordnet_connectors', '*_wordnet_connection.rb')].map do |connection_file|
    [ File.basename(connection_file).split('_').first.to_sym, connection_file ]
  end
]
# An array of typical wordnet install locations (if you have a standard install somewhere else please open an issue on github so we can improve!)
DEFAULT_WORDNET_LOCATIONS = %w[
  /usr/share/wordnet/
  /usr/local/share/wordnet/
  /usr/local/WordNet-3.0/
  /opt/WordNet-3.0/
  /opt/wordnet/
  /opt/local/share/WordNet-3.0/
  /usr/local/Cellar/wordnet/3.0/
]
|
16
|
+
|
17
|
+
# Raised when the requested wordnet connector is not currently available/supported.
class BadWordnetConnector < RuntimeError
end
|
19
|
+
# Raised when there is a problem connecting to a specified wordnet dataset.
class BadWordnetDataset < RuntimeError
end
|
21
|
+
# Raised when there is currently no connection to wordnet, so a request cannot be fulfilled.
class NoWordnetConnection < RuntimeError
end
|
23
|
+
|
24
|
+
# The wordnet class provides a simple control interface for interaction with the wordnet dataset of your choice. It creates a connection, based on specified parameters, to a wordnet dataset and provides
|
25
|
+
# the means to interrogate that dataset. In addition it provides control and information about that wordnet connection.
|
26
|
+
class Wordnet

  # Returns the underlying wordnet connection object.
  #
  # @return [PureWordnetConnection, TokyoWordnetConnection] the underlying wordnet connection object.
  attr_reader :wordnet_connection

  # Constructs a new wordnet connection object.
  #
  # @param [Symbol] connector_type Specifies the connector type or mode desired. Current supported connectors are :pure and :tokyo.
  # @param [String, Symbol] wordnet_path Specifies the directory within which the wordnet dictionary can be found. It can be set to :search to attempt to locate wordnet automatically.
  # @param [String, Symbol] data_path Specifies the directory within which constructed datasets can be found (tokyo index, evocations etc...) It can be set to :default to use the standard location inside the gem directory.
  # @return [Wordnet] The wordnet connection object.
  # @raise [BadWordnetConnector] If an invalid connector type is provided.
  def initialize(connector_type = :pure, wordnet_path = :search, data_path = :default)

    # Check and specify useful paths (wordnet_path becomes nil when wordnet cannot be located)
    wordnet_path = Wordnet::locate_wordnet(wordnet_path)
    data_path = (data_path == :default ? Pathname.new(File.join(File.dirname(__FILE__), '..', 'data')) : Pathname.new( data_path ))

    # Ensure we have a valid connector type
    raise BadWordnetConnector, "You specified an unsupported wordnet connector type. Supported connectors are: #{SUPPORTED_CONNECTIORS}" unless SUPPORTED_CONNECTIORS.include? connector_type

    # We can assume that the desired connector is now available
    desired_connector = SUPPORTED_CONNECTIORS[connector_type]

    # Assuming we have a valid connection type we can import the relevant code (the reason we do this dynamically is to reduce loadtime)
    require desired_connector

    # Construct the connector object; the class name is the camel-cased connector file name
    # (e.g. pure_wordnet_connection.rb -> PureWordnetConnection)
    @wordnet_connection = Words.const_get( File.basename(desired_connector, '.rb').gsub(/(^|_)(.)/) { $2.upcase } ).new(data_path, wordnet_path)

  end

  # Locates the set of homographs within wordnet specific to the term entered.
  #
  # @param [String] term The specific term that is desired from within wordnet.
  # @return [Homographs, nil] An object encapsulating the homographs of the desired term. If the term cannot be located within wordnet then nil is returned.
  # @raise [NoWordnetConnection] If there is currently no wordnet connection.
  def find(term)

    raise NoWordnetConnection, "There is presently no connection to wordnet. To attempt to reistablish a connection you should use the 'open!' command on the Wordnet object." unless connected?
    homographs = @wordnet_connection.homographs(term)
    Homographs.new(homographs, @wordnet_connection) unless homographs.nil?

  end

  # Returns the type of the current wordnet connection.
  #
  # @return [Symbol] The current wordnet connection type. Currently supported :pure & :tokyo.
  def connection_type

    @wordnet_connection.connection_type

  end

  # Returns the path to the wordnet collection currently in use (this may be irrelevant when using the tokyo connector and thus could be nil.)
  #
  # @return [Pathname, nil] The path to the wordnet collection currently in use. Returns nil if unknown.
  def wordnet_path

    @wordnet_connection.wordnet_path

  end

  # Returns the datapath currently in use (this may be irrelevant when using the pure connector and thus could be nil.)
  #
  # @return [Pathname, nil] The path to the data directory currently in use. Returns nil if unknown.
  def data_path

    @wordnet_connection.data_path

  end

  # Causes the current connection to wordnet to be closed.
  #
  def close!

    @wordnet_connection.close!

  end

  # Causes the connection specified within the wordnet object to be reopened if currently closed.
  #
  def open!

    @wordnet_connection.open!

  end

  # Returns the current connection status of the wordnet object.
  #
  # @return [true, false] The current connection status of the wordnet object.
  def connected?

    @wordnet_connection.connected?

  end

  # Returns whether evocations are currently available to use with the current wordnet object. (More information on setting these up can be found within the README)
  #
  # @return [true, false] Whether evocations are currently available or not.
  def evocations?

    @wordnet_connection.evocations?

  end

  # Provides a textual description of the current connection state of the Wordnet object.
  #
  # @return [String] A textual description of the current connection state of the Wordnet object. e.g. "Words not connected" or "Words running in pure mode using wordnet files found at /opt/wordnet"
  def to_s

    # Return a description of the connector
    connected? ? @wordnet_connection.to_s : "Words not connected"

  end

  # Attempts to locate wordnet given one or more directories to look within. Each directory is
  # checked itself and then its 'dict/' sub folder; the first one containing 'data.noun' wins.
  #
  # This is a class-level method and is called with an explicit receiver by the constructor,
  # so it is publicly callable (a bare `private` does not apply to `def self.` methods).
  #
  # @param [String, Array<String>, Symbol] base_dirs Either a path, an array of paths or the :search symbol (which uses DEFAULT_WORDNET_LOCATIONS).
  # @return [Pathname, nil] The pathname of the wordnet dictionary files or nil if they can't be located within the passed directory(s)
  def self.locate_wordnet(base_dirs)

    base_dirs = case base_dirs
      when :search
        DEFAULT_WORDNET_LOCATIONS
      else
        [ base_dirs ].flatten.compact # nil entries cannot be searched, so drop them
    end

    base_dirs.each do |dir|
      # File.join inserts the separator when dir has no trailing slash; plain string
      # concatenation turned "/opt/wordnet" + "dict/" into "/opt/wordnetdict/".
      [ Pathname.new(dir), Pathname.new(File.join(dir.to_s, 'dict/')) ].each do |path|
        return path if (path + "data.noun").exist?
      end
    end

    return nil

  end

end
|
171
|
+
|
172
|
+
end
|