oald_parser 0.2 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/oald_parser/page_parser.rb +22 -5
- data/lib/oald_parser.rb +1 -1
- metadata +3 -2
@@ -25,8 +25,10 @@ module OaldParser
|
|
25
25
|
class PageParser
|
26
26
|
def parse(page)
|
27
27
|
parsed = Nokogiri::HTML(page)
|
28
|
-
if blocks_on_page?
|
28
|
+
if blocks_on_page? parsed
|
29
29
|
Page.new(parse_block(parsed), [])
|
30
|
+
elsif def_on_page? parsed
|
31
|
+
Page.new([], parse_def(parsed))
|
30
32
|
else
|
31
33
|
Page.new([], parse_items(parsed))
|
32
34
|
end
|
@@ -37,6 +39,10 @@ module OaldParser
|
|
37
39
|
page.css('div.sd-g').first
|
38
40
|
end
|
39
41
|
|
42
|
+
def def_on_page?(page)
|
43
|
+
page.css('div.h-g').first
|
44
|
+
end
|
45
|
+
|
40
46
|
def parse_block(page)
|
41
47
|
block_nodes = page.css('div.sd-g')
|
42
48
|
block_nodes.collect do |block|
|
@@ -50,15 +56,26 @@ module OaldParser
|
|
50
56
|
item_nodes = block.css('span.n-g')
|
51
57
|
item_nodes.collect do |item|
|
52
58
|
item_text = all_except(item, 'x-g')
|
53
|
-
|
54
|
-
examples = example_nodes.collect{|e| e.text.strip}
|
55
|
-
Item.new(item_text, examples)
|
59
|
+
Item.new(item_text, parse_examples(item))
|
56
60
|
end
|
57
61
|
end
|
58
62
|
|
63
|
+
def parse_def(page)
|
64
|
+
item_nodes = page.css('div.h-g')
|
65
|
+
item_nodes.collect do |item|
|
66
|
+
item_text = item.css('div.def_block').first.text.strip
|
67
|
+
Item.new(item_text, parse_examples(item))
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def parse_examples(item)
|
72
|
+
example_nodes = item.css('span.x-g')
|
73
|
+
example_nodes.collect{|e| e.text.strip}
|
74
|
+
end
|
75
|
+
|
59
76
|
def all_except(item, class_name)
|
60
77
|
elements = item.children.find_all do |c|
|
61
|
-
!(c.name
|
78
|
+
!(['span', 'div'].include?(c.name) && c[:class] == class_name)
|
62
79
|
end
|
63
80
|
elements.collect{|e|e.text}.join('').strip
|
64
81
|
end
|
data/lib/oald_parser.rb
CHANGED
@@ -8,7 +8,7 @@ require_relative 'oald_parser/page_parser'
|
|
8
8
|
#include OaldParser
|
9
9
|
#
|
10
10
|
#downloader = PageDownloader.new("http://www.oxfordadvancedlearnersdictionary.com/dictionary")
|
11
|
-
#page = downloader.download("
|
11
|
+
#page = downloader.download("prevent")
|
12
12
|
#parser = PageParser.new
|
13
13
|
#parsed = parser.parse(page)
|
14
14
|
#formatter = Formatter.new(items: 15)
|
metadata
CHANGED
@@ -5,7 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 2
|
8
|
-
|
8
|
+
- 1
|
9
|
+
version: 0.2.1
|
9
10
|
platform: ruby
|
10
11
|
authors:
|
11
12
|
- Victor Savkin
|
@@ -13,7 +14,7 @@ autorequire:
|
|
13
14
|
bindir: bin
|
14
15
|
cert_chain: []
|
15
16
|
|
16
|
-
date: 2010-05-
|
17
|
+
date: 2010-05-11 00:00:00 +11:00
|
17
18
|
default_executable:
|
18
19
|
dependencies:
|
19
20
|
- !ruby/object:Gem::Dependency
|