infoboxer 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +1 -1
- data/lib/infoboxer/media_wiki/traits.rb +2 -1
- data/lib/infoboxer/parser/inline.rb +16 -5
- data/lib/infoboxer/parser/table.rb +23 -19
- data/lib/infoboxer/parser/util.rb +12 -2
- data/lib/infoboxer/tree/table.rb +2 -2
- data/lib/infoboxer/version.rb +1 -1
- data/regression/pages/greece.wiki +919 -0
- data/regression/pages/south_america_new.wiki +652 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69ee012f6cd8bb3e923289a4a921e81798ffd2aa
|
4
|
+
data.tar.gz: 42686bced509c310051ba7bc895c68d26a96bd2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cc237b46fe0f59a2ba04d3b53da4a7361cd974e50e1d18f688a077093da664a7d205dd363ddc2622a9af4c1a6f1168315d90fb6867e65c0a61bd60e5e453e81a
|
7
|
+
data.tar.gz: d844446421cffd8e7f6a8fe885f0f1918c772f9e2e7c1d289a1e90b544fde9b3c2b41c8956df7338aab9ad2ae5ca7f5ec9d67fb2d3f16da34367ed5b96a5aa22
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1
1
|
# Infoboxer's change log
|
2
2
|
|
3
|
+
## 0.2.2 (2016-01-03)
|
4
|
+
|
5
|
+
Fixes:
|
6
|
+
* more sophisticated table parsing;
|
7
|
+
* empty `<nowiki/>` is parsed properly;
|
8
|
+
* inline unclosed markup inside wikilinks works;
|
9
|
+
* `MediaWiki::Traits` can now be continued in several places.
|
10
|
+
|
3
11
|
## 0.2.1 (2015-12-21)
|
4
12
|
|
5
13
|
* `infoboxer` binary properly registered.
|
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
[Build Status](https://travis-ci.org/molybdenum-99/infoboxer)
|
5
5
|
[Coverage Status](https://coveralls.io/github/molybdenum-99/infoboxer?branch=master)
|
6
6
|
[Code Climate](https://codeclimate.com/github/molybdenum-99/infoboxer)
|
7
|
-
[Gitter chat](https://gitter.im/molybdenum-99/infoboxer)
|
8
8
|
|
9
9
|
**Infoboxer** is a pure-Ruby Wikipedia (and generic MediaWiki) client and
|
10
10
|
parser, targeting information extraction (hence the name).
|
data/lib/infoboxer/media_wiki/traits.rb
CHANGED
@@ -59,7 +59,8 @@ module Infoboxer
|
|
59
59
|
# [English Wikipedia traits](https://github.com/molybdenum-99/infoboxer/blob/master/lib/infoboxer/definitions/en.wikipedia.org.rb)
|
60
60
|
# for example implementation.
|
61
61
|
def for(domain, &block)
|
62
|
-
|
62
|
+
Traits.domains[domain].tap{|c| c && c.instance_eval(&block)} ||
|
63
|
+
Class.new(self, &block).domain(domain)
|
63
64
|
end
|
64
65
|
|
65
66
|
# @private
|
data/lib/infoboxer/parser/inline.rb
CHANGED
@@ -33,8 +33,10 @@ module Infoboxer
|
|
33
33
|
nodes = Nodes[]
|
34
34
|
guarded_loop do
|
35
35
|
# FIXME: quick and UGLY IS HELL JUST TRYING TO MAKE THE SHIT WORK
|
36
|
-
if @context.inline_eol_sign
|
36
|
+
if @context.inline_eol_sign == /^\]/
|
37
37
|
chunk = @context.scan_until(re.short_inline_until_cache_brackets[until_pattern])
|
38
|
+
elsif @context.inline_eol_sign == /^\]\]/
|
39
|
+
chunk = @context.scan_until(re.short_inline_until_cache_brackets2[until_pattern])
|
38
40
|
else
|
39
41
|
chunk = @context.scan_until(re.short_inline_until_cache[until_pattern])
|
40
42
|
end
|
@@ -97,7 +99,7 @@ module Infoboxer
|
|
97
99
|
when '{{'
|
98
100
|
template
|
99
101
|
when /<nowiki([^>]*)>/
|
100
|
-
nowiki
|
102
|
+
nowiki($1)
|
101
103
|
when /<ref([^>]*)\/>/
|
102
104
|
reference($1, true)
|
103
105
|
when /<ref([^>]*)>/
|
@@ -114,7 +116,12 @@ module Infoboxer
|
|
114
116
|
# [[a|b]]
|
115
117
|
def wikilink
|
116
118
|
link = @context.scan_continued_until(/\||\]\]/)
|
117
|
-
|
119
|
+
if @context.matched == '|'
|
120
|
+
@context.push_eol_sign(/^\]\]/)
|
121
|
+
caption = inline(/\]\]/)
|
122
|
+
@context.pop_eol_sign
|
123
|
+
end
|
124
|
+
|
118
125
|
Wikilink.new(link, caption)
|
119
126
|
end
|
120
127
|
|
@@ -136,8 +143,12 @@ module Infoboxer
|
|
136
143
|
Ref.new(children, parse_params(param_str))
|
137
144
|
end
|
138
145
|
|
139
|
-
def nowiki
|
140
|
-
|
146
|
+
def nowiki(tag_rest)
|
147
|
+
if tag_rest.end_with?('/')
|
148
|
+
Text.new('')
|
149
|
+
else
|
150
|
+
Text.new(@context.scan_continued_until(/<\/nowiki>/))
|
151
|
+
end
|
141
152
|
end
|
142
153
|
end
|
143
154
|
|
data/lib/infoboxer/parser/table.rb
CHANGED
@@ -52,10 +52,10 @@ module Infoboxer
|
|
52
52
|
table_template(table)
|
53
53
|
|
54
54
|
when nil
|
55
|
-
|
55
|
+
return false
|
56
56
|
|
57
57
|
else
|
58
|
-
table_cell_cont(table)
|
58
|
+
return table_cell_cont(table)
|
59
59
|
end
|
60
60
|
true # should continue parsing
|
61
61
|
end
|
@@ -103,29 +103,33 @@ module Infoboxer
|
|
103
103
|
end
|
104
104
|
end
|
105
105
|
|
106
|
-
#
|
107
|
-
#
|
108
|
-
# {|
|
109
|
-
# <caption>....</caption>
|
110
|
-
#
|
111
|
-
# Solution is NOT elegant or semantically "right", yet it works.
|
112
|
-
# Somehow.
|
106
|
+
# Good news, everyone! Table can be IMPLICITLY closed when it's
|
107
|
+
# not in "cell" context.
|
113
108
|
#
|
109
|
+
# Unless it's an empty row, which is just skipped.
|
114
110
|
def table_cell_cont(table)
|
115
111
|
container = case (last = table.children.last)
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
112
|
+
when TableRow
|
113
|
+
last.children.last
|
114
|
+
when TableCaption
|
115
|
+
last
|
116
|
+
else
|
117
|
+
nil
|
118
|
+
end
|
119
|
+
|
120
|
+
if !container
|
121
|
+
# return "table not continued" unless row is empty
|
122
|
+
if @context.current.empty?
|
123
|
+
return true
|
124
|
+
else
|
125
|
+
@context.prev!
|
126
|
+
return false
|
127
|
+
end
|
125
128
|
end
|
126
|
-
|
129
|
+
|
127
130
|
container.push_children(paragraph(/^\s*([|!]|{\|)/))
|
128
131
|
table.push_children(container) unless container.parent
|
132
|
+
true
|
129
133
|
end
|
130
134
|
end
|
131
135
|
end
|
data/lib/infoboxer/parser/util.rb
CHANGED
@@ -19,13 +19,20 @@ module Infoboxer
|
|
19
19
|
}}
|
20
20
|
)]x
|
21
21
|
|
22
|
-
|
22
|
+
INLINE_EOL_BRACK = %r[(?= # if we have ahead... (not scanned, just checked
|
23
23
|
</ref> | # <ref> closed
|
24
24
|
}} | # or template closed
|
25
25
|
(?<!\])\](?!\]) # or ext.link closed,
|
26
26
|
# the madness with look-ahead/behind means "match single bracket but not double"
|
27
27
|
)]x
|
28
28
|
|
29
|
+
# FIXME: ok, NOW it's officially ridiculous
|
30
|
+
INLINE_EOL_BRACK2 = %r[(?= # if we have ahead... (not scanned, just checked
|
31
|
+
</ref> | # <ref> closed
|
32
|
+
}} | # or template closed
|
33
|
+
\]\] # or int.link closed
|
34
|
+
)]x
|
35
|
+
|
29
36
|
|
30
37
|
def make_regexps
|
31
38
|
{
|
@@ -38,7 +45,10 @@ module Infoboxer
|
|
38
45
|
h[r] = Regexp.union(*[r, INLINE_EOL, FORMATTING, /$/].compact.uniq)
|
39
46
|
},
|
40
47
|
short_inline_until_cache_brackets: Hash.new{|h, r|
|
41
|
-
h[r] = Regexp.union(*[r,
|
48
|
+
h[r] = Regexp.union(*[r, INLINE_EOL_BRACK, FORMATTING, /$/].compact.uniq)
|
49
|
+
},
|
50
|
+
short_inline_until_cache_brackets2: Hash.new{|h, r|
|
51
|
+
h[r] = Regexp.union(*[r, INLINE_EOL_BRACK2, FORMATTING, /$/].compact.uniq)
|
42
52
|
}
|
43
53
|
|
44
54
|
}
|
data/lib/infoboxer/tree/table.rb
CHANGED
@@ -25,13 +25,13 @@ module Infoboxer
|
|
25
25
|
#
|
26
26
|
# FIXME: it can easily be several table heading rows
|
27
27
|
def heading_row
|
28
|
-
rows.first.children.all?(&call(matches?: TableHeading)) ?
|
28
|
+
rows.first && rows.first.children.all?(&call(matches?: TableHeading)) ?
|
29
29
|
rows.first : nil
|
30
30
|
end
|
31
31
|
|
32
32
|
# For now, returns all table rows except {#heading_row}
|
33
33
|
def body_rows
|
34
|
-
rows.first.children.all?(&call(matches?: TableHeading)) ?
|
34
|
+
rows.first && rows.first.children.all?(&call(matches?: TableHeading)) ?
|
35
35
|
rows[1..-1] :
|
36
36
|
rows
|
37
37
|
end
|