marc 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +7 -18
- data/Rakefile +3 -4
- data/lib/marc/marc8/map_to_unicode.rb +16458 -0
- data/lib/marc/marc8/to_unicode.rb +198 -0
- data/lib/marc/reader.rb +133 -112
- data/lib/marc/version.rb +1 -1
- data/test/bad_eacc_encoding.marc8.marc +1 -0
- data/test/bib178448.okay.human +24 -0
- data/test/bib178448.okay.marc +1 -0
- data/test/bib178448.writtenout.marc +1 -0
- data/test/escaped_character_reference.marc8.marc +1 -0
- data/test/marc8/data/test_marc8.txt +1514 -0
- data/test/marc8/data/test_utf8.txt +1514 -0
- data/test/marc8/tc_marc8_mapping.rb +11 -0
- data/test/marc8/tc_to_unicode.rb +154 -0
- data/test/marc_with_bad_utf8.utf8.human +40 -0
- data/test/marc_with_bad_utf8.utf8.marc +1 -0
- data/test/tc_reader_char_encodings.rb +92 -5
- metadata +61 -15
- data/test/tc_weird_jruby_bytes.rb +0 -62
@@ -1,62 +0,0 @@
|
|
1
|
-
require 'test/unit'
|
2
|
-
|
3
|
-
|
4
|
-
# jruby 1.7.4 (1.9.3p392) 2013-05-16 2390d3b on Java HotSpot(TM) 64-Bit Server VM 1.6.0_51-b11-457-11M4509 [darwin-x86_64]
|
5
|
-
class TestField < Test::Unit::TestCase
|
6
|
-
|
7
|
-
def test_confused_bytecount
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
string_with_ctrl = "hello\x1fhello".force_encoding("UTF-8")
|
14
|
-
# control chars like \x1F ARE legal UTF-8, this is correct:
|
15
|
-
assert string_with_ctrl.valid_encoding?
|
16
|
-
|
17
|
-
# It's even considered ascii_only? -- this is correct, both MRI and jruby
|
18
|
-
assert string_with_ctrl.ascii_only?
|
19
|
-
|
20
|
-
|
21
|
-
# For reasons I can't explain, I can only reproduce the
|
22
|
-
# problem right now by doing a split, on the control char
|
23
|
-
# (this does represent my actual use case)
|
24
|
-
# Whether the split operand is tagged ASCII or UTF-8 does not matter,
|
25
|
-
# case is identical either way.
|
26
|
-
elements = string_with_ctrl.split("\x1F".force_encoding("UTF-8"))
|
27
|
-
|
28
|
-
# For some reason weirdness only happens on the second one in the split
|
29
|
-
# in this case.
|
30
|
-
second = elements[1]
|
31
|
-
|
32
|
-
|
33
|
-
# For a string composed of all one-byte wide ascii, as this one is...
|
34
|
-
assert_equal "hello", second
|
35
|
-
assert second.ascii_only?
|
36
|
-
|
37
|
-
# string[0] and string.byteslice(0) should be identical. They are
|
38
|
-
# different when the string contains multi-byte chars.
|
39
|
-
# using #[], we're okay
|
40
|
-
assert_equal "h", second[0]
|
41
|
-
|
42
|
-
# But on jruby, this following actually raises an exception!
|
43
|
-
assert_equal "h", second.byteslice(0)
|
44
|
-
# That one up there actually just raised!!!
|
45
|
-
# Java::JavaLang::ArrayIndexOutOfBoundsException: 12
|
46
|
-
# org.jruby.util.ByteList.equal(ByteList.java:960)
|
47
|
-
|
48
|
-
# In other cases I saw in my real app, it didn't raise, but
|
49
|
-
# did return the WRONG bytes. I.e., not an 'h' above as expected, or
|
50
|
-
# not:
|
51
|
-
|
52
|
-
|
53
|
-
assert_equal second[0], second.byteslice(0)
|
54
|
-
# but in jruby we never even get here, we raise.
|
55
|
-
|
56
|
-
# In MRI, we pass ALL these tests with no exceptions.
|
57
|
-
# (ruby 1.9.3p448 (2013-06-27 revision 41675) [x86_64-darwin12.4.0])
|
58
|
-
end
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
end
|