nebrija 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +9 -0
- data/bin/nebrija +9 -0
- data/lib/nebrija/parser.rb +75 -0
- data/lib/nebrija.rb +64 -0
- data/test/test_basic.rb +63 -0
- metadata +100 -0
data/Rakefile
ADDED
data/bin/nebrija
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
# Converts an HTML document (parsed with Nokogiri) into plain hashes/arrays.
#
# Parser#parse returns one of:
# * {:error => String} when the page <title> contains "error";
# * {:id => Integer, :data => [{:word => String, :meanings => [...]}, ...]}
#   when the page has no top-level <ul> (see #single?);
# * [{:word => String, :href => String}, ...] otherwise.
class Parser
  # Leading run of "abbr. " tokens in front of a definition text.
  # Pattern kept exactly as originally written.
  META_REGEX = /^([a-zA-Z]{1,4}+\.[ ]{1,2})+/

  # Headword used when an entry paragraph carries no <span> text.
  PLACEHOLDER_WORD = '=>'

  # rae_data - String with the raw HTML to parse.
  # word     - String substituted for every "~" found in entry headwords.
  def initialize(rae_data, word)
    # Newlines are dropped and runs of spaces collapsed before parsing.
    compact_html = rae_data.gsub(/[\n]+/, '').gsub(/[ ]{2,}+/, ' ')
    @doc = Nokogiri::HTML(compact_html)
    @word = word
  end

  # Parses the document and returns an error hash, a single-entry hash or a
  # list of candidate words (see the class comment for the exact shapes).
  def parse
    return {:error => 'Word does not exist. Sorry.'} unless valid?

    single? ? parse_single : parse_multiple
  end

  # True when the document has no `body > ul` list.
  def single?
    @doc.css('body > ul').length.zero?
  end

  private

  # Builds {:id, :data} from the `body > div` anchor and paragraphs.
  # Paragraphs with class "p" open a new entry; every other paragraph is a
  # meaning appended to the most recent entry.
  def parse_single
    anchor = @doc.css('body > div > a').first
    data = []

    @doc.css('body > div > p').each do |paragraph|
      if paragraph['class'] == 'p'
        data << new_entry(paragraph.css('span').inner_text)
      else
        meaning = build_meaning(paragraph.inner_text)
        next if meaning.nil?

        # A meaning that shows up before any headword used to crash on
        # nil[:meanings]; it now lands in an entry with the placeholder word.
        data << new_entry('') if data.empty?
        data.last[:meanings] << meaning
      end
    end

    # A missing anchor (or missing name attribute) yields id 0 via nil.to_i.
    {:id => (anchor ? anchor['name'] : nil).to_i, :data => data}
  end

  # Returns a fresh entry hash for the given headword text.
  def new_entry(headword)
    headword = PLACEHOLDER_WORD if headword == ''
    {:word => headword.gsub(/~/, @word).strip.capitalize, :meanings => []}
  end

  # Turns the text of a non-headword paragraph into {:word, :meta}.
  # Returns nil when the paragraph should be skipped.
  def build_meaning(raw_text)
    text = raw_text.strip.gsub(/[0-9]+\.[ ]/, '') # drop "1. "-style numbering
    # Parenthesised notes such as "(Del latín ...)" are not meanings.
    return nil if text.start_with?('(')

    # Collect the WHOLE matched prefix. String#scan with the repeated capture
    # group only reported the last abbreviation ("m. adj. " gave "adj.").
    abbreviations = []
    text = text.gsub(META_REGEX) do |prefix|
      abbreviations << prefix
      ''
    end
    return nil if text == ''

    meta = abbreviations.join.strip
    {:word => text, :meta => (meta == '' ? nil : meta)}
  end

  # Builds the candidate list from the `body > ul > li > a` links.
  def parse_multiple
    @doc.css('body > ul > li > a').map do |link|
      {
        :word => link.css('span').first.inner_text,
        :href => link['href'].gsub(/search\?id=/, '')
      }
    end
  end

  # True unless the page <title> contains "error".
  def valid?
    (@doc.css('title').inner_text =~ /error/).nil?
  end
end
|
data/lib/nebrija.rb
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'nebrija/parser'
|
2
|
+
require 'typhoeus'
|
3
|
+
|
4
|
+
|
5
|
+
# Base class for dictionary look-ups: fetches raw data through #query and
# hands it to Parser. Subclasses provide #query.
class Rae
  # Looks up +word+ and returns whatever Parser#parse produces for the data
  # obtained from #query.
  def search(word)
    Parser.new(query(word), word).parse
  end

  private

  # Returns the raw data for +word+. Must be overridden by subclasses.
  #
  # Raises NotImplementedError when called on a class that does not override
  # it. The previous code raised a RuntimeError whose message merely was the
  # string 'NotImplementedError'.
  def query(word)
    raise NotImplementedError, "#{self.class} must implement #query"
  end
end
|
16
|
+
|
17
|
+
|
18
|
+
# Rae variant whose "word" is a path to a file holding the data to parse.
class FileRae < Rae
  private

  # Returns the contents of +file+.
  #
  # File.read is used instead of IO.read: IO.read treats an argument that
  # starts with "|" as a command to spawn, which is never wanted for a path.
  def query(file)
    File.read(file)
  end
end
|
25
|
+
|
26
|
+
|
27
|
+
# Rae variant that fetches the data over HTTP with Typhoeus.
class HTTPRae < Rae
  # NOTE(review): defined but not sent with the request by this class.
  USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'
  SEARCH_URL = 'http://lema.rae.es/drae/srv/search?'
  # Seconds to wait for the request. The original line had no value, which
  # silently chained into the next assignment and made this constant a Regexp.
  # NOTE(review): 10 is a reviewer-chosen default; adjust as needed.
  REQUEST_TIMEOUT = 10
  ID_REGEX = /[0-9]/

  private

  # POSTs the search for +word+ and returns the response body.
  # Queries containing a digit are sent as "id=", all others as "val=".
  def query(word)
    @word = word

    param_name = val? ? 'val=' : 'id='
    response = Typhoeus::Request.post(
      "#{SEARCH_URL}#{param_name}#{word}",
      body: build_headers,
      timeout: REQUEST_TIMEOUT
    )
    response.body
  end

  # True when the current word contains no digit.
  def val?
    (@word =~ ID_REGEX).nil?
  end

  # Builds the "key=value&..." string sent as the POST body.
  # Values are joined as-is; the 'TS014dfc77_cr' value is already
  # percent-encoded.
  def build_headers
    fields = {
      'TS014dfc77_id' => 3,
      'TS014dfc77_cr' => '42612abd48551544c72ae36bc40f440a%3Akkmj%3AQG60Q2v4%3A1477350835',
      'TS014dfc77_76' => 0,
      'TS014dfc77_md' => 1,
      'TS014dfc77_rf' => 0,
      'TS014dfc77_ct' => 0,
      'TS014dfc77_pd' => 0
    }
    fields.map { |key, value| "#{key}=#{value}" }.join('&')
  end
end
|
data/test/test_basic.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'nebrija'
|
3
|
+
|
4
|
+
# Absolute path of the fixtures directory, resolved relative to this file so
# the tests no longer depend on the directory they are launched from.
MOCKS_DIR = File.expand_path('../mocks', __FILE__)
|
5
|
+
|
6
|
+
# Smoke tests that run FileRae against the HTML fixtures in MOCKS_DIR.
class TestMockedParserBasic < Test::Unit::TestCase
  # The error fixture must produce an :error entry.
  def test_error_basic
    result = mocked_search('error')
    assert_not_nil result[:error]
  end

  # The single-entry fixture must produce a :data entry.
  def test_single_basic
    result = mocked_search('single')
    assert_not_nil result[:data]
  end

  # The multiple-entry fixture must list exactly two candidates.
  def test_multiple_basic
    result = mocked_search('multiple')
    assert result.length == 2
  end

  private

  # Runs a FileRae search over "<MOCKS_DIR>/<fixture>.html".
  def mocked_search(fixture)
    FileRae.new.search("#{MOCKS_DIR}/#{fixture}.html")
  end
end
|
20
|
+
|
21
|
+
# Content checks that run FileRae against the HTML fixtures in MOCKS_DIR.
class TestMockedParserContent < Test::Unit::TestCase
  # The single-entry fixture must yield more than 20 data items.
  def test_single_basic
    entries = mocked_search('single')[:data]
    assert entries.length > 20
  end

  # The multiple-entry fixture must list 'bancar' first and 'banco' second.
  # The fixture is searched once per expectation, as before.
  def test_multiple_basic
    %w[bancar banco].each_with_index do |expected_word, position|
      assert mocked_search('multiple')[position][:word] == expected_word
    end
  end

  private

  # Runs a FileRae search over "<MOCKS_DIR>/<fixture>.html".
  def mocked_search(fixture)
    FileRae.new.search("#{MOCKS_DIR}/#{fixture}.html")
  end
end
|
32
|
+
|
33
|
+
|
34
|
+
# Smoke tests that run HTTPRae against the live service.
#
# This class used to be declared as TestMockedParserBasic a second time,
# which reopened the fixture-based class of that name and replaced its
# test_error_basic, test_single_basic and test_multiple_basic methods, so the
# mocked versions never ran. It is named TestParserBasic to match
# TestParserContent.
class TestParserBasic < Test::Unit::TestCase
  # A query containing digits is looked up by id and must return :data.
  def test_single_basic_id
    assert_not_nil HTTPRae.new.search('MHpGWYJ6YDXX2bw9Ghwm')[:data]
  end

  # A nonsense word must produce an :error entry.
  def test_error_basic
    assert_not_nil HTTPRae.new.search('jddhfgsd')[:error]
  end

  # A word with one entry must return :data.
  def test_single_basic
    assert_not_nil HTTPRae.new.search('a')[:data]
  end

  # 'banco' must list exactly two candidates.
  def test_multiple_basic
    assert HTTPRae.new.search('banco').length == 2
  end
end
|
52
|
+
|
53
|
+
# Content checks that run HTTPRae against the live service.
class TestParserContent < Test::Unit::TestCase
  # The entry for 'a' must yield more than 4 data items.
  def test_single_basic
    entries = live_search('a')[:data]
    assert entries.length > 4
  end

  # 'banco' must list 'bancar' first and 'banco' second.
  # One request is made per expectation, as before.
  def test_multiple_basic
    %w[bancar banco].each_with_index do |expected_word, position|
      assert live_search('banco')[position][:word] == expected_word
    end
  end

  private

  # Runs an HTTPRae search for +query+.
  def live_search(query)
    HTTPRae.new.search(query)
  end
end
|
metadata
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nebrija
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- ! '@javierhonduco'
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-07-11 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: typhoeus
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: rake
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: A gem to access the rae dictionary
|
63
|
+
email: a@a.a
|
64
|
+
executables:
|
65
|
+
- nebrija
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- Rakefile
|
70
|
+
- lib/nebrija.rb
|
71
|
+
- lib/nebrija/parser.rb
|
72
|
+
- bin/nebrija
|
73
|
+
- test/test_basic.rb
|
74
|
+
homepage: http://rubygems.org/gems/nebrija
|
75
|
+
licenses:
|
76
|
+
- MIT
|
77
|
+
post_install_message:
|
78
|
+
rdoc_options: []
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
none: false
|
83
|
+
requirements:
|
84
|
+
- - ! '>='
|
85
|
+
- !ruby/object:Gem::Version
|
86
|
+
version: '0'
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
none: false
|
89
|
+
requirements:
|
90
|
+
- - ! '>='
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: '0'
|
93
|
+
requirements: []
|
94
|
+
rubyforge_project:
|
95
|
+
rubygems_version: 1.8.23
|
96
|
+
signing_key:
|
97
|
+
specification_version: 3
|
98
|
+
summary: dictionary gem and stuff
|
99
|
+
test_files:
|
100
|
+
- test/test_basic.rb
|