iudex-html 1.2.b.0-java → 1.2.b.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.rdoc +6 -0
- data/Manifest.txt +1 -1
- data/bin/iudex-html-clean +113 -25
- data/build/attributes +5 -3
- data/build/tags +113 -98
- data/lib/iudex-html/base.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.2.b.0.jar → iudex-html-1.2.b.1.jar} +0 -0
- data/pom.xml +2 -2
- data/test/test_html_parser.rb +5 -0
- metadata +4 -4
data/History.rdoc
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 1.2.b.1 (2012-5-31)
|
2
|
+
* Add support for HTML 5 (draft) tags, attributes
|
3
|
+
* Neko parser support for HTML 5 <meta charset>
|
4
|
+
* Neko parser keeps non-HTML attributes when skipBanned = false
|
5
|
+
* Add options, barc read support to iudex-html-clean
|
6
|
+
|
1
7
|
=== 1.2.b.0 (2012-3-4)
|
2
8
|
* Upgrade to gravitext-xmlprod ~> 1.5.b
|
3
9
|
* Fix duplicate attributes from Neko, last value wins.
|
data/Manifest.txt
CHANGED
data/bin/iudex-html-clean
CHANGED
@@ -19,40 +19,128 @@
|
|
19
19
|
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
20
|
|
21
21
|
require 'rubygems'
|
22
|
-
require 'rjack-logback'
|
23
22
|
|
24
|
-
|
25
|
-
require '
|
23
|
+
module IudexBinScript
|
24
|
+
require 'rjack-logback'
|
25
|
+
include RJack
|
26
26
|
|
27
|
-
|
27
|
+
Logback.config_console( :stderr => true )
|
28
28
|
|
29
|
-
require '
|
29
|
+
require 'iudex-html'
|
30
|
+
require 'iudex-filter/key_helper'
|
30
31
|
|
31
|
-
|
32
|
-
include Iudex::HTML
|
33
|
-
include Iudex::HTML::Tree
|
34
|
-
include Iudex::HTML::Filters
|
35
|
-
include Iudex::HTML::Tree::Filters
|
32
|
+
require 'gravitext-xmlprod/extensions'
|
36
33
|
|
37
|
-
|
38
|
-
import 'iudex.html.tree.TreeWalker'
|
34
|
+
require 'java'
|
39
35
|
|
40
|
-
|
41
|
-
|
42
|
-
|
36
|
+
class HTMLCleaner
|
37
|
+
include Gravitext::XMLProd
|
38
|
+
include Iudex::Core
|
39
|
+
include Iudex::HTML
|
40
|
+
include Iudex::HTML::Tree
|
41
|
+
include Iudex::HTML::Filters
|
42
|
+
include Iudex::HTML::Tree::Filters
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
EmptyInlineRemover.new ] )
|
44
|
+
import 'iudex.html.HTMLUtils'
|
45
|
+
import 'iudex.html.neko.NekoHTMLParser'
|
46
|
+
import 'iudex.html.tree.TreeWalker'
|
47
|
+
import 'iudex.http.Headers'
|
48
|
+
import 'iudex.util.Charsets'
|
50
49
|
|
51
|
-
|
50
|
+
def initialize
|
51
|
+
@default_encoding = "UTF-8"
|
52
|
+
@trim_non_displayed = false
|
53
|
+
@indentor = Indentor::PRETTY
|
54
|
+
end
|
55
|
+
|
56
|
+
def run( args = ARGV )
|
57
|
+
files = parse_args( args )
|
58
|
+
|
59
|
+
files.each do |f|
|
60
|
+
if f =~ /\.barc$/
|
61
|
+
process_barc( f )
|
62
|
+
else
|
63
|
+
process_file( f )
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
def process_file( fname )
|
70
|
+
|
71
|
+
input = if fname == '-'
|
72
|
+
$stdin.read
|
73
|
+
else
|
74
|
+
IO.read( fname )
|
75
|
+
end
|
76
|
+
|
77
|
+
source = HTMLUtils::source( input.to_java_bytes, @default_encoding )
|
78
|
+
process( source )
|
79
|
+
end
|
80
|
+
|
81
|
+
def process_barc( bname )
|
82
|
+
require 'iudex-barc' #FIXME: Undeclared
|
83
|
+
barc_file = Iudex::BARC::BARCFile.new( java.io.File.new( bname ) )
|
84
|
+
barc_reader = barc_file.reader
|
85
|
+
while( rec = barc_reader.next )
|
86
|
+
next unless rec.type.chr == 'H'
|
87
|
+
source = ContentSource.new( rec.body_input_stream )
|
88
|
+
ctype = Headers.content_type( rec.response_headers )
|
89
|
+
if ctype && ctype.charset
|
90
|
+
enc = Charsets.lookup( ctype.charset )
|
91
|
+
source.set_default_encoding( enc ) if enc
|
92
|
+
end
|
93
|
+
process( source )
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def process( source )
|
98
|
+
parser = NekoHTMLParser.new
|
99
|
+
parser.skip_banned = @trim_non_displayed
|
100
|
+
|
101
|
+
tree = parser.parse( source )
|
102
|
+
|
103
|
+
filters = [ XmpToPreConverter.new,
|
104
|
+
( [ CSSDisplayFilter.new,
|
105
|
+
AttributeCleaner.new ] if @trim_non_displayed ),
|
106
|
+
MojiBakeCleaner.new,
|
107
|
+
CharactersNormalizer.new,
|
108
|
+
EmptyInlineRemover.new ].flatten.compact
|
109
|
+
|
110
|
+
TreeWalker.walk_depth_first( TreeFilterChain.new( filters ),
|
111
|
+
tree )
|
112
|
+
|
113
|
+
puts tree.to_xml( :indentor => @indentor )
|
114
|
+
end
|
115
|
+
|
116
|
+
def parse_args( args = ARGV )
|
117
|
+
parser = OptionParser.new do |opts|
|
118
|
+
opts.banner =
|
119
|
+
"Usage: iudex-html-clean [options] (FILE|barc)...\n" +
|
120
|
+
"Options:\n"
|
121
|
+
|
122
|
+
opts.on( "-v", "--version", "Display version and exit" ) do
|
123
|
+
puts "iudex-html: #{Iudex::HTML::VERSION}"
|
124
|
+
exit 1
|
125
|
+
end
|
126
|
+
|
127
|
+
opts.on( "-t", "--trim-non-display",
|
128
|
+
"Trim banned/non-displayed elements from output" ) do
|
129
|
+
@trim_non_displayed = true
|
130
|
+
end
|
131
|
+
|
132
|
+
opts.on( "-i", "--indentor NAME",
|
133
|
+
"Specify indentor to use for output: " +
|
134
|
+
"PRETTY (default), COMPACT" ) do |name|
|
135
|
+
@indentor = Indentor.const_get( name.upcase.to_sym )
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
139
|
+
|
140
|
+
parser.parse( args )
|
141
|
+
end
|
52
142
|
|
53
|
-
puts tree.to_xml
|
54
143
|
end
|
55
144
|
|
145
|
+
HTMLCleaner.new.run
|
56
146
|
end
|
57
|
-
|
58
|
-
HTMLCleaner.new.run
|
data/build/attributes
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
# Sources
|
22
22
|
# http://www.w3.org/TR/xhtml11/
|
23
23
|
# http://www.w3.org/TR/html4/
|
24
|
+
# http://dev.w3.org/html5/spec/
|
24
25
|
# http://www.w3schools.com/tags/ref_standardattributes.asp
|
25
26
|
# http://xhtml.com
|
26
27
|
|
@@ -28,6 +29,7 @@ CORE :: ALL except: base head html meta param script style title
|
|
28
29
|
class ,*CORE
|
29
30
|
id ,*CORE
|
30
31
|
style ,*CORE
|
32
|
+
hidden ,*CORE, hidden element
|
31
33
|
title ,CORE, extra title
|
32
34
|
|
33
35
|
LANG :: ALL except: base br frame frameset hr iframe param
|
@@ -40,17 +42,17 @@ content ,meta, text
|
|
40
42
|
scheme ,meta, format URI
|
41
43
|
|
42
44
|
# Anchor and link attributes
|
43
|
-
charset ,a link, char_encoding of link
|
45
|
+
charset ,a link meta, char_encoding of link or (meta) document
|
44
46
|
coords ,*a, coordinates; i.e. image map
|
45
47
|
hreflang ,link, language_code of referent
|
46
48
|
href ,a base link, URL
|
47
|
-
media ,link
|
49
|
+
media ,a area link
|
48
50
|
name ,a, section_name anchor
|
49
51
|
rel ,a link
|
50
52
|
rev ,a link
|
51
53
|
shape ,*a
|
52
54
|
target ,*a *base *link
|
53
|
-
type ,link
|
55
|
+
type ,a link
|
54
56
|
|
55
57
|
# Image and some frame attributes
|
56
58
|
src ,frame img
|
data/build/tags
CHANGED
@@ -17,6 +17,7 @@
|
|
17
17
|
# Sources
|
18
18
|
# http://www.w3.org/TR/xhtml11/
|
19
19
|
# http://www.w3.org/TR/html4/
|
20
|
+
# http://dev.w3.org/html5/spec/
|
20
21
|
# http://www.w3schools.com/tags/default.asp
|
21
22
|
# http://xhtml.com/
|
22
23
|
#
|
@@ -25,106 +26,120 @@
|
|
25
26
|
# S :: In Strict HTML 4.01/XHTML 1.0
|
26
27
|
# T :: In Transitional HTML 4.01/XHTML 1.0
|
27
28
|
# F :: In frameset annex
|
29
|
+
# 5 :: HTML5 new elements
|
28
30
|
# D :: Deprecated
|
29
31
|
# I :: Inline elements (Note <br/> is not labeled inline.)
|
30
32
|
# M :: Metadata elements (content not visible text), i.e. head
|
31
33
|
# B :: Banned/blacklisted elements from which text should not be extracted.
|
32
34
|
|
33
|
-
a , S T F I , anchor
|
34
|
-
abbr , S T F I , abbreviation
|
35
|
-
acronym , S T F
|
36
|
-
address , S T F , contact information for the author or owner
|
37
|
-
applet , T F
|
38
|
-
area ,E S T F , area inside an image-map
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
35
|
+
a , S T F 5 I , anchor
|
36
|
+
abbr , S T F 5 I , abbreviation
|
37
|
+
acronym , S T F I , acronym
|
38
|
+
address , S T F 5 , contact information for the author or owner
|
39
|
+
applet , T F D , embedded applet
|
40
|
+
area ,E S T F 5 , area inside an image-map
|
41
|
+
article , 5 , Structure: An independent content element
|
42
|
+
aside , 5 , Structure: Tangentially related content
|
43
|
+
b , S T F 5 I , bold text
|
44
|
+
base ,E S T F 5 M , default address or target for all links on a page
|
45
|
+
basefont ,E T F D I M , default font; color; or size for the text in a page
|
46
|
+
bdi , 5 I , Text isolated from surrounding for BIDI formatting
|
47
|
+
bdo , S T F 5 I , the text direction
|
48
|
+
big , S T F I , big text
|
49
|
+
blockquote , S T F 5 , long quotation
|
50
|
+
body , S T F 5 , the document's body
|
51
|
+
br ,E S T F 5 , single line break
|
52
|
+
button , S T F 5 I B, push button
|
53
|
+
caption , S T F 5 , table caption
|
54
|
+
center , T F D , centered text
|
55
|
+
cite , S T F 5 I , citation
|
56
|
+
code , S T F 5 I , computer code text
|
57
|
+
col ,E S T F 5 , attribute values for one or more columns in a table
|
58
|
+
colgroup , S T F 5 , group of columns in a table for formatting
|
59
|
+
dd , S T F 5 , description of a term in a definition list
|
60
|
+
del , S T F 5 I , deleted text
|
61
|
+
dfn , S T F 5 I , definition term
|
62
|
+
dir , T F D , directory list
|
63
|
+
div , S T F 5 , section in a document
|
64
|
+
dl , S T F 5 , definition list
|
65
|
+
dt , S T F 5 , term (an item) in a definition list
|
66
|
+
em , S T F 5 I , emphasized text
|
67
|
+
fieldset , S T F 5 B, border around elements in a form
|
68
|
+
figcaption , 5 , Structure: A figure caption
|
69
|
+
figure , 5 , Structure: Self contained content that can be moved.
|
70
|
+
font , T F D I , font; color; or size for text
|
71
|
+
footer , 5 , Structure: A footer of a section
|
72
|
+
form , S T F 5 , form for user input
|
73
|
+
frame ,E F B, window (a frame) in a frameset
|
74
|
+
frameset , F B, set of frames
|
75
|
+
h1 , S T F 5 , heading level 1
|
76
|
+
h2 , S T F 5 , heading level 2
|
77
|
+
h3 , S T F 5 , heading level 3
|
78
|
+
h4 , S T F 5 , heading level 4
|
79
|
+
h5 , S T F 5 , heading level 5
|
80
|
+
h6 , S T F 5 , heading level 6
|
81
|
+
head , S T F 5 M , information about the document
|
82
|
+
header , 5 , Structure: A header of a section
|
83
|
+
hgroup , 5 , Structure: A group of headings
|
84
|
+
hr ,E S T F 5 , horizontal line
|
85
|
+
html , S T F 5 , document
|
86
|
+
i , S T F 5 I , italic text
|
87
|
+
iframe , T F 5 , inline frame
|
88
|
+
img ,E S T F 5 I , image
|
89
|
+
input ,E S T F 5 I B, input control
|
90
|
+
ins , S T F 5 I , inserted text
|
91
|
+
isindex , T F D , searchable index related to a document
|
92
|
+
kbd , S T F 5 I , keyboard text
|
93
|
+
label , S T F 5 I B, label for an input element
|
94
|
+
legend , S T F 5 B, caption for a fieldset element
|
95
|
+
li , S T F 5 , list item
|
96
|
+
link ,E S T F 5 M , relationship between a document and an external resource
|
97
|
+
map , S T F 5 I , image-map
|
98
|
+
mark , 5 I , Text marked/highlighted for reference purposes
|
99
|
+
menu , T F 5 D , menu list
|
100
|
+
meta ,E S T F 5 M , metadata
|
101
|
+
nav , 5 , Structure: container for navigational links
|
102
|
+
noframes , T F B, alternate content where frames not supported
|
103
|
+
noscript , S T F 5 B, alternate content script not supported
|
104
|
+
object , S T F 5 I B, embedded object
|
105
|
+
ol , S T F 5 , ordered list
|
106
|
+
optgroup , S T F 5 B, group of related options in a select list
|
107
|
+
option , S T F 5 B, option in a select list
|
108
|
+
p , S T F 5 , paragraph
|
109
|
+
param ,E S T F 5 , parameter for an object
|
110
|
+
pre , S T F 5 , preformatted text
|
111
|
+
q , S T F 5 I , short quotation
|
112
|
+
rb , 5 , ruby base text
|
113
|
+
rbc , 5 , ruby base container (complex)
|
114
|
+
rp , 5 , ruby simple text container
|
115
|
+
rt , 5 , ruby annotation text
|
116
|
+
rtc , 5 , ruby text container (complex)
|
117
|
+
ruby , 5 I , ruby pronunciation aid
|
118
|
+
s , T F 5 D I , strikethrough text
|
119
|
+
samp , S T F 5 I , sample computer code
|
120
|
+
script , S T F 5 I B, client-side script
|
121
|
+
section , 5 , Structure: generic document/application section
|
122
|
+
select , S T F 5 I B, select list (drop-down list)
|
123
|
+
small , S T F 5 I , small text
|
124
|
+
span , S T F 5 I , section in a document
|
125
|
+
strike , T F D I , strikethrough text
|
126
|
+
strong , S T F 5 I , strong text
|
127
|
+
style , S T F 5 B, style information for a document
|
128
|
+
sub , S T F 5 I , subscripted text
|
129
|
+
sup , S T F 5 I , superscripted text
|
130
|
+
table , S T F 5 , table
|
131
|
+
tbody , S T F 5 , Groups the body content in a table
|
132
|
+
td , S T F 5 , cell in a table
|
133
|
+
textarea , S T F 5 I B, multi-line text input control
|
134
|
+
tfoot , S T F 5 , Groups the footer content in a table
|
135
|
+
th , S T F 5 , header cell in a table
|
136
|
+
thead , S T F 5 , Groups the header content in a table
|
137
|
+
time , 5 I , A date or time
|
138
|
+
title , S T F 5 M , the title of a document
|
139
|
+
tr , S T F 5 , row in a table
|
140
|
+
tt , S T F I , teletype text
|
141
|
+
u , T F 5 D I , underlined text
|
142
|
+
ul , S T F 5 , unordered list
|
143
|
+
var , S T F 5 I , variable part of a text
|
144
|
+
wbr ,E 5 I , A line break opportunity
|
145
|
+
xmp , D , preformatted text
|
data/lib/iudex-html/base.rb
CHANGED
Binary file
|
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-html</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.2.b.0</version>
|
6
|
+
<version>1.2.b.1</version>
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
8
|
|
9
9
|
<parent>
|
@@ -24,7 +24,7 @@
|
|
24
24
|
<dependency>
|
25
25
|
<groupId>com.gravitext</groupId>
|
26
26
|
<artifactId>gravitext-xmlprod</artifactId>
|
27
|
-
<version>[1.5,1.5.9999)</version>
|
27
|
+
<version>[1.5.1,1.5.9999)</version>
|
28
28
|
</dependency>
|
29
29
|
|
30
30
|
<dependency>
|
data/test/test_html_parser.rb
CHANGED
@@ -48,6 +48,11 @@ HTML
|
|
48
48
|
assert_doc( alt, parse( alt, "UTF-8" ) )
|
49
49
|
end
|
50
50
|
|
51
|
+
def test_meta_charset_rerun
|
52
|
+
alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
|
53
|
+
assert_doc( alt, parse( alt, "ISO-8859-1" ) )
|
54
|
+
end
|
55
|
+
|
51
56
|
HTML_SKIP_TAGS = <<HTML
|
52
57
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
53
58
|
<head>
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: 4
|
5
|
-
version: 1.2.b.0
|
5
|
+
version: 1.2.b.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-06-01 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: iudex-core
|
@@ -41,7 +41,7 @@ dependencies:
|
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version: 1.5.b
|
44
|
+
version: 1.5.1
|
45
45
|
requirement: *id003
|
46
46
|
prerelease: false
|
47
47
|
type: :runtime
|
@@ -117,7 +117,7 @@ files:
|
|
117
117
|
- test/test_stax_parser.rb
|
118
118
|
- test/test_tree_walker.rb
|
119
119
|
- test/test_word_counters.rb
|
120
|
-
- lib/iudex-html/iudex-html-1.2.b.0.jar
|
120
|
+
- lib/iudex-html/iudex-html-1.2.b.1.jar
|
121
121
|
homepage: http://github.com/dekellum/iudex
|
122
122
|
licenses: []
|
123
123
|
|